nbroad committed on Feb 1

Commit

6e0489f

verified ·

1 Parent(s): b875fc2

Training in progress, step 100

Browse files

Files changed (22) hide show

.gitattributes +2 -0
.hydra/config.yaml +80 -0
.hydra/hydra.yaml +157 -0
.hydra/overrides.yaml +1 -0
adapter_config.json +37 -0
adapter_model.safetensors +3 -0
special_tokens_map.json +24 -0
tokenizer.json +3 -0
tokenizer_config.json +0 -0
train.log +0 -0
train.py +252 -0
training_args.bin +3 -0
utils.py +0 -0
wandb/debug-internal.log +7 -0
wandb/debug.log +25 -0
wandb/run-20250201_230729-f0utp5v4/files/output.log +62 -0
wandb/run-20250201_230729-f0utp5v4/files/requirements.txt +228 -0
wandb/run-20250201_230729-f0utp5v4/files/wandb-metadata.json +45 -0
wandb/run-20250201_230729-f0utp5v4/logs/debug-core.log +6 -0
wandb/run-20250201_230729-f0utp5v4/logs/debug-internal.log +7 -0
wandb/run-20250201_230729-f0utp5v4/logs/debug.log +25 -0
wandb/run-20250201_230729-f0utp5v4/run-f0utp5v4.wandb +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250201_230729-f0utp5v4/run-f0utp5v4.wandb filter=lfs diff=lfs merge=lfs -text

.hydra/config.yaml ADDED Viewed

	@@ -0,0 +1,80 @@

+time_start: null
+DEBUG: false
+debug_model: unsloth/Qwen2.5-7B-bnb-4bit
+fold: 0
+random_seed: true
+train_on_all_folds: false
+eval_only: false
+merge_adapters: false
+wandb_id: null
+val_split_name: val
+pad_token: <pad>
+response_template_ids:
+- 4
+num_proc: 20
+hub_repo_tags:
+- odesia
+script_args:
+  dataset_name: nbroad/odesia-combined-v1
+  config: null
+  gradient_checkpointing_use_reentrant: true
+  ignore_bias_buffers: false
+model_config:
+  model_name_or_path: mistralai/Ministral-8B-Instruct-2410
+  torch_dtype: bfloat16
+  attn_implementation: flash_attention_2
+  use_peft: true
+  lora_r: 16
+  lora_alpha: 32
+  lora_dropout: 0.05
+  lora_target_modules:
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+  - up_proj
+  - down_proj
+  - gate_proj
+  lora_modules_to_save: null
+  lora_task_type: CAUSAL_LM
+  use_rslora: true
+  load_in_8bit: false
+  load_in_4bit: false
+  bnb_4bit_quant_type: nf4
+  use_bnb_nested_quant: true
+training_args:
+  resume_from_checkpoint: null
+  output_dir: ./
+  num_train_epochs: 1
+  per_device_train_batch_size: 8
+  per_device_eval_batch_size: 8
+  warmup_ratio: 0.1
+  fp16: false
+  bf16: true
+  eval_strategy: steps
+  save_strategy: steps
+  eval_steps: 100
+  save_steps: 100
+  save_total_limit: 2
+  logging_steps: 2
+  run_name: null
+  weight_decay: 0.01
+  report_to: wandb
+  learning_rate: 6.0e-05
+  metric_for_best_model: loss
+  greater_is_better: false
+  gradient_checkpointing: true
+  gradient_accumulation_steps: 8
+  gradient_checkpointing_kwargs:
+    use_reentrant: true
+  optim: adamw_torch
+  dataloader_num_workers: 4
+  seed: 18
+  max_grad_norm: 2.0
+  load_best_model_at_end: true
+  push_to_hub: true
+  hub_private_repo: true
+  lr_scheduler_type: cosine
+  remove_unused_columns: false
+  ddp_find_unused_parameters: false
+  use_liger_kernel: true

.hydra/hydra.yaml ADDED Viewed

	@@ -0,0 +1,157 @@

+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+      Use --hydra-help to view Hydra specific help
+      '
+    template: '${hydra.help.header}
+      == Configuration groups ==
+      Compose your configuration from those groups (group=option)
+      $APP_CONFIG_GROUPS
+      == Config ==
+      Override anything in the config (foo.bar=value)
+      $CONFIG
+      ${hydra.help.footer}
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+      See https://hydra.cc for more info.
+      == Flags ==
+      $FLAGS_HELP
+      == Configuration groups ==
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+      $HYDRA_CONFIG_GROUPS
+      Use ''--cfg hydra'' to Show the Hydra config.
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: train
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: ministral.yaml
+    env_set:
+      WANDB_RUN_GROUP: clm
+      TOKENIZERS_PARALLELISM: 'False'
+      HF_HUB_ENABLE_HF_TRANSFER: '0'
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /home/ubuntu/odesia-2025/train
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /home/ubuntu/odesia-2025/train/conf
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false

.hydra/overrides.yaml ADDED Viewed

	@@ -0,0 +1 @@


1	+ []

adapter_config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Ministral-8B-Instruct-2410",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": true
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:72ef603437fd4c7865ad680b7a08163250634b533159f2956050c3481f855947
+size 174655536

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d7edbeaf20dd7f571b5dd1c54d9ace4f9b6299127cc7ba2afb14a6d51a4a79a4
+size 17078136

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

train.log ADDED Viewed

File without changes

train.py ADDED Viewed

	@@ -0,0 +1,252 @@

+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+)
+from datasets import load_dataset
+from omegaconf import DictConfig, OmegaConf
+import hydra
+import wandb
+import shutil
+import os
+from functools import partial
+from pathlib import Path
+from trl import (
+    SFTTrainer,
+    ModelConfig,
+    get_quantization_config,
+    get_kbit_device_map,
+    get_peft_config,
+    DataCollatorForCompletionOnlyLM,
+)
+from dotenv import load_dotenv
+from peft import (
+    get_peft_model,
+    prepare_model_for_kbit_training,
+    AutoPeftModelForSequenceClassification,
+)
+# from utils import add_metric_to_card
+# Load environment variables from ../.env at import time, overriding any values
+# already present in the process environment (override=True). main() below reads
+# HF_HUB_CACHE, HF_GATED, HF_WRITE_PERSONAL and WANDB_RUN_GROUP from os.environ.
+# NOTE(review): the path is relative, so it presumably resolves against the
+# current working directory rather than this file's location — confirm.
+loaded = load_dotenv("../.env", override=True)
+# Fail fast when load_dotenv reports a falsy result instead of continuing
+# without the expected variables.
+if not loaded:
+    raise ValueError("Failed to load .env file")
+# Convert one dataset row into model inputs by rendering a two-turn chat
+# (user -> assistant) through the tokenizer's chat template.
+#   example:   mapping that provides the "text" (user turn) and "response"
+#              (assistant turn) fields.
+#   tokenizer: object exposing apply_chat_template(); passed in via
+#              fn_kwargs when this function is used with Dataset.map in main().
+# Returns a dict with the single key "input_ids".
+def tokenize(example, tokenizer):
+    # No extra arguments are passed, so the template's defaults apply.
+    # NOTE(review): presumably this yields a list of token ids (tokenize=True is
+    # the transformers default) — confirm for the pinned transformers version.
+    ids = tokenizer.apply_chat_template([
+        {"role": "user", "content": example["text"]},
+        {"role": "assistant", "content": example["response"]},
+    ])
+    # Only input_ids are emitted here; no attention mask or labels are built
+    # in this function.
+    return {
+        "input_ids": ids,
+    }
+@hydra.main(config_path="conf", config_name="q7b-4bit")
+def main(cfg: DictConfig):
+    cfg.time_start = "_".join(str(Path.cwd()).rsplit("/", 2)[-2:])
+    if cfg.DEBUG:
+        cfg.model_config.model_name_or_path = cfg.debug_model
+    script_args = cfg.script_args
+    training_args = TrainingArguments(**OmegaConf.to_container(cfg.training_args))
+    model_config = ModelConfig(**OmegaConf.to_container(cfg.model_config))
+    if training_args.process_index == 0:
+        if cfg.eval_only or training_args.resume_from_checkpoint is not None:
+            wandb_id = cfg.wandb_id
+            resume = "must"
+            config = None
+        else:
+            wandb_id = None
+            resume = None
+            config = OmegaConf.to_container(cfg)
+        wandb.init(config=config, id=wandb_id, resume=resume)
+        # copy current file to output, so it gets saved to hub
+        shutil.copy(
+            Path(__file__).resolve(),
+            Path(training_args.output_dir) / Path(__file__).name,
+        )
+        shutil.copy(
+            Path(__file__).resolve().parent / "utils.py",
+            Path(training_args.output_dir) / "utils.py",
+        )
+    quantization_config = get_quantization_config(model_config)
+    model_kwargs = dict(
+        revision=model_config.model_revision,
+        trust_remote_code=model_config.trust_remote_code,
+        attn_implementation=model_config.attn_implementation,
+        torch_dtype=model_config.torch_dtype,
+        use_cache=False if training_args.gradient_checkpointing else True,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+        cache_dir=os.environ["HF_HUB_CACHE"],
+    )
+    peft_config = get_peft_config(model_config)
+    if training_args.use_liger_kernel:
+        from liger_kernel.transformers import (
+            apply_liger_kernel_to_qwen2,
+            apply_liger_kernel_to_llama,
+            apply_liger_kernel_to_mistral,
+        )
+        apply_liger_kernel_to_qwen2()
+        apply_liger_kernel_to_llama()
+        apply_liger_kernel_to_mistral()
+    if cfg.eval_only:
+        model = AutoPeftModelForSequenceClassification.from_pretrained(
+            model_config.model_name_or_path,
+            **model_kwargs,
+            token=os.environ["HF_WRITE_PERSONAL"],
+        )
+        if cfg.merge_adapters:
+            model = model.merge_and_unload()
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_config.model_name_or_path,
+            **model_kwargs,
+            token=os.environ["HF_GATED"],
+        )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_config.model_name_or_path,
+        use_fast=True,
+        token=os.environ["HF_GATED"],
+    )
+    tokenizer.padding_side = "left"
+    tokenizer.pad_token = cfg.pad_token
+    if not cfg.eval_only and model_config.load_in_4bit:
+        model = prepare_model_for_kbit_training(
+            model,
+            use_gradient_checkpointing=training_args.gradient_checkpointing,
+            gradient_checkpointing_kwargs=training_args.gradient_checkpointing_kwargs,
+        )
+    elif not cfg.eval_only and training_args.gradient_checkpointing:
+        model.enable_input_require_grads()
+    if not cfg.eval_only:
+        model = get_peft_model(model, peft_config)
+    with training_args.main_process_first():
+        ds = load_dataset(
+            script_args.dataset_name,
+            script_args.config,
+            token=os.environ["HF_WRITE_PERSONAL"],
+        )
+        if cfg.DEBUG:
+            ds[cfg.train_split_name] = (
+                ds[cfg.train_split_name].shuffle().select(range(100))
+            )
+            ds[cfg.val_split_name] = ds[cfg.val_split_name].shuffle().select(range(100))
+        if not cfg.eval_only:
+            ds[cfg.val_split_name] = ds[cfg.val_split_name].shuffle().select(range(500))
+        ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=cfg.num_proc, remove_columns=ds["train"].column_names)
+    collator = DataCollatorForCompletionOnlyLM(
+        tokenizer=tokenizer,
+        mlm=False,
+        pad_to_multiple_of=16,
+        response_template=cfg.response_template_ids
+    )
+    if training_args.process_index == 0:
+        group = os.environ["WANDB_RUN_GROUP"]
+        training_args.hub_model_id = f"nbroad/nbroad-odesia-{group}-{wandb.run.id}"
+        training_args.hub_token = os.environ["HF_WRITE_PERSONAL"]
+    prefix = ""
+    if cfg.eval_only:
+        if "awq" in model_config.model_name_or_path.lower():
+            prefix = "awq_"
+        if model_config.load_in_4bit:
+            prefix += "int4_"
+        elif model_config.torch_dtype == "bfloat16":
+            prefix += "bf16_"
+        elif model_config.torch_dtype == "float16":
+            prefix += "fp16_"
+    trainer = SFTTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=ds["train"],
+        eval_dataset=(
+            ds[cfg.val_split_name] if training_args.eval_strategy != "no" else None
+        ),
+        processing_class=tokenizer,
+        data_collator=collator,
+        # compute_metrics=partial(compute_metrics, prefix=prefix),
+    )
+    if training_args.process_index == 0:
+        trainer.model.config.update(
+            {
+                "wandb_id": wandb.run.id,
+                "fold": cfg.fold,
+                "group": group,
+                "dataset": script_args.dataset_name,
+            }
+        )
+    if not cfg.eval_only:
+        if training_args.resume_from_checkpoint is not None:
+            os.chdir(Path(training_args.resume_from_checkpoint).parent)
+        trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+    else:
+        metrics = trainer.evaluate()
+        # if training_args.process_index == 0:
+            # met = [x for x in metrics if "accuracy" in x][0]
+            # result = add_metric_to_card(
+            #     repo=training_args.hub_model_id,
+            #     metrics_pretty_name=met,
+            #     metrics_value=metrics[met],
+            #     dataset_id=script_args.dataset_name,
+            #     dataset_split=cfg.val_split_name,
+            #     model_path=model_config.model_name_or_path,
+            #     model_dtype=model_config.torch_dtype,
+            #     token=os.environ["HF_WRITE_PERSONAL"],
+            # )
+            # print(result)
+    if not cfg.eval_only:
+        # Save and push to hub
+        trainer.save_model(training_args.output_dir)
+        if training_args.push_to_hub:
+            trainer.push_to_hub(
+                dataset_name=script_args.dataset_name,
+                model_name=model_config.model_name_or_path,
+                tags=cfg.hub_repo_tags,
+            )
+    if training_args.process_index == 0:
+        wandb.finish()
+# Script entry point. main() is called without arguments because it is wrapped
+# by the @hydra.main decorator at its definition.
+if __name__ == "__main__":
+    main()

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:630794b30c10210f357fe4a09f9c3f5bc93c8d4d863fb3314cce36a75bd7f376
+size 5624

utils.py ADDED Viewed

File without changes

wandb/debug-internal.log ADDED Viewed

	@@ -0,0 +1,7 @@

+{"time":"2025-02-01T23:07:29.707725114Z","level":"INFO","msg":"stream: starting","core version":"0.19.5","symlink path":"/home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29/wandb/run-20250201_230729-f0utp5v4/logs/debug-core.log"}
+{"time":"2025-02-01T23:07:29.811064717Z","level":"INFO","msg":"created new stream","id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.811095949Z","level":"INFO","msg":"stream: started","id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.81113275Z","level":"INFO","msg":"writer: Do: started","stream_id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.811142254Z","level":"INFO","msg":"sender: started","stream_id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.811153678Z","level":"INFO","msg":"handler: started","stream_id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.920262374Z","level":"INFO","msg":"Starting system monitor"}

wandb/debug.log ADDED Viewed

	@@ -0,0 +1,25 @@

+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Current SDK version is 0.19.5
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Configure stats pid to 9387
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Loading settings from /home/ubuntu/.config/wandb/settings
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Loading settings from /home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29/wandb/settings
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Loading settings from environment variables
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:setup_run_log_directory():637] Logging user logs to /home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29/wandb/run-20250201_230729-f0utp5v4/logs/debug.log
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:setup_run_log_directory():638] Logging internal logs to /home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29/wandb/run-20250201_230729-f0utp5v4/logs/debug-internal.log
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:init():756] calling init triggers
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:init():761] wandb.init called with sweep_config: {}
+config: {'time_start': '2025-02-01_23-07-29', 'DEBUG': False, 'debug_model': 'unsloth/Qwen2.5-7B-bnb-4bit', 'fold': 0, 'random_seed': True, 'train_on_all_folds': False, 'eval_only': False, 'merge_adapters': False, 'wandb_id': None, 'val_split_name': 'val', 'pad_token': '<pad>', 'response_template_ids': [4], 'num_proc': 20, 'hub_repo_tags': ['odesia'], 'script_args': {'dataset_name': 'nbroad/odesia-combined-v1', 'config': None, 'gradient_checkpointing_use_reentrant': True, 'ignore_bias_buffers': False}, 'model_config': {'model_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', 'torch_dtype': 'bfloat16', 'attn_implementation': 'flash_attention_2', 'use_peft': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj'], 'lora_modules_to_save': None, 'lora_task_type': 'CAUSAL_LM', 'use_rslora': True, 'load_in_8bit': False, 'load_in_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'use_bnb_nested_quant': True}, 'training_args': {'resume_from_checkpoint': None, 'output_dir': './', 'num_train_epochs': 1, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'warmup_ratio': 0.1, 'fp16': False, 'bf16': True, 'eval_strategy': 'steps', 'save_strategy': 'steps', 'eval_steps': 100, 'save_steps': 100, 'save_total_limit': 2, 'logging_steps': 2, 'run_name': None, 'weight_decay': 0.01, 'report_to': 'wandb', 'learning_rate': 6e-05, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'gradient_checkpointing': True, 'gradient_accumulation_steps': 8, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'optim': 'adamw_torch', 'dataloader_num_workers': 4, 'seed': 18, 'max_grad_norm': 2.0, 'load_best_model_at_end': True, 'push_to_hub': True, 'hub_private_repo': True, 'lr_scheduler_type': 'cosine', 'remove_unused_columns': False, 'ddp_find_unused_parameters': False, 'use_liger_kernel': True}, '_wandb': {}}
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:init():789] starting backend
+2025-02-01 23:07:29,705 INFO    MainThread:9387 [wandb_init.py:init():793] sending inform_init request
+2025-02-01 23:07:29,706 INFO    MainThread:9387 [backend.py:_multiprocessing_setup():97] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-02-01 23:07:29,706 INFO    MainThread:9387 [wandb_init.py:init():808] backend started and connected
+2025-02-01 23:07:29,708 INFO    MainThread:9387 [wandb_init.py:init():901] updated telemetry
+2025-02-01 23:07:29,710 INFO    MainThread:9387 [wandb_init.py:init():926] communicating run to backend with 90.0 second timeout
+2025-02-01 23:07:29,918 INFO    MainThread:9387 [wandb_init.py:init():984] starting run threads in backend
+2025-02-01 23:07:30,016 INFO    MainThread:9387 [wandb_run.py:_console_start():2385] atexit reg
+2025-02-01 23:07:30,016 INFO    MainThread:9387 [wandb_run.py:_redirect():2235] redirect: wrap_raw
+2025-02-01 23:07:30,016 INFO    MainThread:9387 [wandb_run.py:_redirect():2300] Wrapping output streams.
+2025-02-01 23:07:30,016 INFO    MainThread:9387 [wandb_run.py:_redirect():2325] Redirects installed.
+2025-02-01 23:07:30,017 INFO    MainThread:9387 [wandb_init.py:init():1026] run started, returning control to user process
+2025-02-01 23:08:27,289 INFO    MainThread:9387 [wandb_run.py:_config_callback():1253] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': {'v_proj', 'up_proj', 'down_proj', 'k_proj', 'o_proj', 'q_proj', 'gate_proj'}, 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': True, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 131072, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'sliding_window': 32768, 'head_dim': 128, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 100000000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 
'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', '_attn_implementation_autoset': True, 'transformers_version': '4.48.2', 'model_type': 'mistral', 'wandb_id': 'f0utp5v4', 'fold': 0, 'group': 'clm', 'dataset': 'nbroad/odesia-combined-v1', 'output_dir': './', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 6e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 2.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Feb01_23-07-29_192-222-56-224', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 
'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 18, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': 'nbroad/nbroad-odesia-clm-f0utp5v4', 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 
'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': True, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'dataset_text_field': 'text', 'packing': False, 'max_seq_length': 1024, 'dataset_num_proc': None, 'dataset_batch_size': 1000, 'model_init_kwargs': None, 'dataset_kwargs': {}, 'eval_packing': None, 'num_of_sequences': 1024, 'chars_per_token': '<CHARS_PER_TOKEN>', 'use_liger': False}
+2025-02-01 23:08:27,294 INFO    MainThread:9387 [wandb_config.py:__setitem__():154] config set model/num_parameters = 8063455232 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0xfedf481b3130>>
+2025-02-01 23:08:27,294 INFO    MainThread:9387 [wandb_run.py:_config_callback():1253] config_cb model/num_parameters 8063455232 None

wandb/run-20250201_230729-f0utp5v4/files/output.log ADDED Viewed

	@@ -0,0 +1,62 @@

+Applied Liger kernels to Qwen2
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+Loading checkpoint shards: 100%|███████████████████████████████████████████████████████| 4/4 [00:00<00:00,  9.63it/s]
+You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
+Map (num_proc=20): 100%|███████████████████████████████████████████████████| 500/500 [00:03<00:00, 142.98 examples/s]
+/home/ubuntu/.local/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a processing_class with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision. You might consider adding `processing_class.padding_side = 'right'` to your code.
+  warnings.warn(
+The model is not an instance of PreTrainedModel. No liger kernels will be applied.
+[34m[1mwandb[0m: [33mWARNING[0m The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
+{'loss': 0.9506, 'grad_norm': 43.904598236083984, 'learning_rate': 2.1428571428571427e-06, 'epoch': 0.0}
+{'loss': 0.7928, 'grad_norm': 28.828706741333008, 'learning_rate': 4.2857142857142855e-06, 'epoch': 0.01}
+{'loss': 0.7947, 'grad_norm': 20.504751205444336, 'learning_rate': 6.428571428571429e-06, 'epoch': 0.01}
+{'loss': 0.4011, 'grad_norm': 10.117717742919922, 'learning_rate': 8.571428571428571e-06, 'epoch': 0.01}
+{'loss': 0.3382, 'grad_norm': 8.685040473937988, 'learning_rate': 1.0714285714285714e-05, 'epoch': 0.02}
+{'loss': 0.2912, 'grad_norm': 9.239078521728516, 'learning_rate': 1.2857142857142857e-05, 'epoch': 0.02}
+{'loss': 0.2343, 'grad_norm': 5.444039344787598, 'learning_rate': 1.5e-05, 'epoch': 0.03}
+{'loss': 0.2005, 'grad_norm': 6.025508880615234, 'learning_rate': 1.7142857142857142e-05, 'epoch': 0.03}
+{'loss': 0.1983, 'grad_norm': 4.520172595977783, 'learning_rate': 1.928571428571429e-05, 'epoch': 0.03}
+{'loss': 0.2099, 'grad_norm': 4.423872947692871, 'learning_rate': 2.1428571428571428e-05, 'epoch': 0.04}
+{'loss': 0.1928, 'grad_norm': 4.764410018920898, 'learning_rate': 2.357142857142857e-05, 'epoch': 0.04}
+{'loss': 0.1733, 'grad_norm': 3.6303350925445557, 'learning_rate': 2.5714285714285714e-05, 'epoch': 0.04}
+{'loss': 0.2067, 'grad_norm': 3.779996395111084, 'learning_rate': 2.7857142857142858e-05, 'epoch': 0.05}
+{'loss': 0.2261, 'grad_norm': 4.439054012298584, 'learning_rate': 3e-05, 'epoch': 0.05}
+{'loss': 0.2546, 'grad_norm': 4.5176215171813965, 'learning_rate': 3.2142857142857144e-05, 'epoch': 0.05}
+{'loss': 0.1637, 'grad_norm': 3.3032355308532715, 'learning_rate': 3.4285714285714284e-05, 'epoch': 0.06}
+{'loss': 0.1875, 'grad_norm': 4.7432169914245605, 'learning_rate': 3.6428571428571423e-05, 'epoch': 0.06}
+{'loss': 0.1727, 'grad_norm': 3.2494094371795654, 'learning_rate': 3.857142857142858e-05, 'epoch': 0.06}
+{'loss': 0.1926, 'grad_norm': 4.482065200805664, 'learning_rate': 4.0714285714285717e-05, 'epoch': 0.07}
+{'loss': 0.1706, 'grad_norm': 2.981718063354492, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.07}
+{'loss': 0.1594, 'grad_norm': 4.96915864944458, 'learning_rate': 4.5e-05, 'epoch': 0.08}
+{'loss': 0.1828, 'grad_norm': 3.0867984294891357, 'learning_rate': 4.714285714285714e-05, 'epoch': 0.08}
+{'loss': 0.1709, 'grad_norm': 3.345247507095337, 'learning_rate': 4.928571428571428e-05, 'epoch': 0.08}
+{'loss': 0.1815, 'grad_norm': 3.01137638092041, 'learning_rate': 5.142857142857143e-05, 'epoch': 0.09}
+{'loss': 0.2335, 'grad_norm': 5.000576972961426, 'learning_rate': 5.3571428571428575e-05, 'epoch': 0.09}
+{'loss': 0.193, 'grad_norm': 6.766908168792725, 'learning_rate': 5.5714285714285715e-05, 'epoch': 0.09}
+{'loss': 0.1902, 'grad_norm': 4.348695755004883, 'learning_rate': 5.785714285714286e-05, 'epoch': 0.1}
+{'loss': 0.1889, 'grad_norm': 3.219531774520874, 'learning_rate': 6e-05, 'epoch': 0.1}
+{'loss': 0.1536, 'grad_norm': 3.3314571380615234, 'learning_rate': 5.999763132611449e-05, 'epoch': 0.1}
+{'loss': 0.1715, 'grad_norm': 3.6466634273529053, 'learning_rate': 5.9990525678499e-05, 'epoch': 0.11}
+{'loss': 0.1848, 'grad_norm': 3.5136561393737793, 'learning_rate': 5.9978684179217676e-05, 'epoch': 0.11}
+{'loss': 0.1405, 'grad_norm': 2.3875110149383545, 'learning_rate': 5.996210869818053e-05, 'epoch': 0.12}
+{'loss': 0.1667, 'grad_norm': 2.477057695388794, 'learning_rate': 5.994080185284815e-05, 'epoch': 0.12}
+{'loss': 0.1254, 'grad_norm': 2.3600058555603027, 'learning_rate': 5.991476700781841e-05, 'epoch': 0.12}
+{'loss': 0.2003, 'grad_norm': 3.6588945388793945, 'learning_rate': 5.9884008274295174e-05, 'epoch': 0.13}
+{'loss': 0.1626, 'grad_norm': 2.740492343902588, 'learning_rate': 5.984853050943901e-05, 'epoch': 0.13}
+{'loss': 0.2002, 'grad_norm': 3.283466339111328, 'learning_rate': 5.9808339315600256e-05, 'epoch': 0.13}
+{'loss': 0.1682, 'grad_norm': 2.546633243560791, 'learning_rate': 5.976344103943434e-05, 'epoch': 0.14}
+{'loss': 0.1892, 'grad_norm': 3.356383800506592, 'learning_rate': 5.971384277089953e-05, 'epoch': 0.14}
+{'loss': 0.1502, 'grad_norm': 2.480137348175049, 'learning_rate': 5.965955234213742e-05, 'epoch': 0.14}
+{'loss': 0.1647, 'grad_norm': 3.033668041229248, 'learning_rate': 5.960057832623604e-05, 'epoch': 0.15}
+{'loss': 0.1562, 'grad_norm': 1.9358534812927246, 'learning_rate': 5.9536930035876166e-05, 'epoch': 0.15}
+{'loss': 0.1617, 'grad_norm': 4.796277046203613, 'learning_rate': 5.9468617521860666e-05, 'epoch': 0.15}
+{'loss': 0.1382, 'grad_norm': 2.9329020977020264, 'learning_rate': 5.939565157152741e-05, 'epoch': 0.16}
+{'loss': 0.1484, 'grad_norm': 3.4402196407318115, 'learning_rate': 5.9318043707045806e-05, 'epoch': 0.16}
+{'loss': 0.1837, 'grad_norm': 2.546185255050659, 'learning_rate': 5.923580618359732e-05, 'epoch': 0.17}
+{'loss': 0.1634, 'grad_norm': 2.6801390647888184, 'learning_rate': 5.9148951987440224e-05, 'epoch': 0.17}
+{'loss': 0.1356, 'grad_norm': 2.303766965866089, 'learning_rate': 5.9057494833858934e-05, 'epoch': 0.17}
+{'loss': 0.1862, 'grad_norm': 6.340909957885742, 'learning_rate': 5.896144916499822e-05, 'epoch': 0.18}
+{'loss': 0.1572, 'grad_norm': 5.256030082702637, 'learning_rate': 5.8860830147582575e-05, 'epoch': 0.18}
+{'eval_loss': 0.16956467926502228, 'eval_runtime': 33.4907, 'eval_samples_per_second': 14.93, 'eval_steps_per_second': 1.881, 'epoch': 0.18}

wandb/run-20250201_230729-f0utp5v4/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,228 @@

+aiohttp==3.11.11
+typing_extensions==4.12.2
+antlr4-python3-runtime==4.9.3
+async-timeout==5.0.1
+huggingface-hub==0.28.1
+wandb==0.19.5
+docker-pycreds==0.4.0
+urllib3==2.3.0
+pyarrow==19.0.0
+omegaconf==2.3.0
+pydantic==2.10.6
+python-dotenv==1.0.1
+multiprocess==0.70.16
+aiosignal==1.3.2
+hf_transfer==0.1.9
+datasets==3.2.0
+safetensors==0.5.2
+multidict==6.1.0
+dill==0.3.8
+liger_kernel==0.5.2
+annotated-types==0.7.0
+frozenlist==1.5.0
+sentry-sdk==2.20.0
+yarl==1.18.3
+smmap==5.0.2
+transformers==4.48.2
+GitPython==3.1.44
+hydra-core==1.3.2
+pydantic_core==2.27.2
+tokenizers==0.21.0
+Jinja2==3.1.5
+requests==2.32.3
+setproctitle==1.3.4
+aiohappyeyeballs==2.4.4
+regex==2024.11.6
+gitdb==4.0.12
+charset-normalizer==3.4.1
+xxhash==3.5.0
+tqdm==4.67.1
+trl==0.14.0
+peft==0.14.0
+propcache==0.2.1
+accelerate==1.3.0
+future==0.18.2
+msgpack==1.0.3
+websocket-client==1.2.3
+pexpect==4.8.0
+pyOpenSSL==21.0.0
+protobuf==4.21.12
+PyGObject==3.42.1
+lazr.restfulclient==0.14.4
+lz4==3.1.3+dfsg
+icdiff==2.0.4
+unicodedata2==14.0.0
+fs==2.4.12
+unattended-upgrades==0.1
+py==1.10.0
+torchvision==0.20.1
+jupyter-core==4.9.1
+livereload==2.6.3
+libtmux==0.10.1
+jax==0.4.30
+ufoLib2==0.13.1
+wadllib==1.3.6
+networkx==2.4
+cycler==0.11.0
+configobj==5.0.6
+pipx==1.0.0
+ctop==1.0.0
+jsonpatch==1.32
+ssh-import-id==5.11
+einops==0.8.0
+wrapt==1.13.3
+argcomplete==1.8.1
+decorator==4.4.2
+olefile==0.46
+traitlets==5.1.1
+jeepney==0.7.1
+iotop==0.6
+pystache==0.6.0
+importlib-metadata==4.6.4
+kiwisolver==1.3.2
+distro-info==1.1+ubuntu0.2
+flake8==4.0.1
+nest-asyncio==1.5.4
+wheel==0.37.1
+scipy==1.8.0
+ptyprocess==0.7.0
+distlib==0.3.4
+filelock==3.6.0
+lazr.uri==1.0.6
+zope.interface==5.4.0
+docker==5.0.3
+matplotlib==3.5.1
+python-magic==0.4.24
+fsspec==2024.3.1
+rich==11.2.0
+cloud-init==24.4
+Twisted==22.1.0
+more-itertools==8.10.0
+ipython==7.31.1
+torch==2.5.1
+dbus-python==1.2.18
+tornado==6.1
+bcrypt==3.2.0
+netifaces==0.11.0
+Markdown==3.3.6
+virtualenv==20.13.0+ds
+cryptography==3.4.8
+pyserial==3.5
+flatbuffers==1.12.1-git20200711.33e2d80-dfsg1-0.6
+parso==0.8.1
+fonttools==4.29.1
+tmuxp==1.9.2
+pysmi==0.3.2
+launchpadlib==1.10.16
+appdirs==1.4.4
+mkdocs==1.1.2
+grpcio==1.30.2
+MarkupSafe==2.0.1
+zipp==1.0.0
+pycryptodomex==3.11.0
+jedi==0.18.0
+flash-attn==2.7.0.post2
+html5lib==1.1
+astunparse==1.6.3
+triton==3.1.0
+pyasn1==0.4.8
+pyrsistent==0.18.1
+command-not-found==0.3
+userpath==1.8.0
+ply==3.11
+attrs==21.2.0
+mpmath==0.0.0
+PyJWT==2.3.0
+google-pasta==0.2.0
+Werkzeug==2.0.2
+pyinotify==0.9.6
+PyHamcrest==2.0.2
+Automat==20.2.0
+colorama==0.4.4
+beniget==0.4.1
+influxdb==5.3.1
+h5py==3.6.0
+Pillow==9.0.1
+Jinja2==3.0.3
+Glances==3.2.4.2
+platformdirs==2.5.1
+ufw==0.36.1
+jsonschema==3.2.0
+jaxlib==0.4.30
+pandas==1.3.5
+python-dateutil==2.8.1
+matplotlib-inline==0.1.3
+SecretStorage==3.3.1
+Brotli==1.0.9
+prompt-toolkit==3.0.28
+absl-py==2.1.0
+blinker==1.4
+optree==0.13.1
+numpy==1.21.5
+service-identity==18.1.0
+bottle==0.12.19
+hyperlink==21.0.0
+idna==3.3
+webencodings==0.5.1
+gast==0.5.2
+certifi==2020.6.20
+pyparsing==2.4.7
+PyYAML==5.4.1
+sos==4.7.2
+bleach==4.1.0
+backcall==0.2.0
+urllib3==1.26.5
+incremental==21.3.0
+joblib==0.17.0
+python-apt==2.4.0+ubuntu4
+commonmark==0.9.1
+pythran==0.10.0
+h5py.-debian-h5py-serial==3.6.0
+threadpoolctl==3.1.0
+kaptan==0.5.12
+setuptools==59.6.0
+keras==3.6.0
+mccabe==0.6.1
+jsonpointer==2.0
+packaging==21.3
+cffi==1.15.0
+typing_extensions==4.9.0
+sympy==1.12
+httplib2==0.20.2
+ipykernel==6.7.0
+defusedxml==0.7.1
+ipython_genutils==0.2.0
+Pygments==2.11.2
+lxml==4.8.0
+nvidia-ml-py==12.555.43
+pysnmp==4.4.12
+pycparser==2.21
+keyring==23.5.0
+tensorboard==2.18.0
+psutil==5.9.0
+requests==2.25.1
+six==1.16.0
+termcolor==1.1.0
+pyflakes==2.4.0
+oauthlib==3.2.0
+beautifulsoup4==4.10.0
+entrypoints==0.4
+ml-dtypes==0.5.0
+scikit-learn==0.23.2
+constantly==15.1.0
+pyzmq==22.3.0
+tensorflow==2.18.0
+soupsieve==2.3.1
+chardet==4.0.0
+wcwidth==0.2.5
+pip==22.0.2
+jupyter-client==7.1.2
+pyasn1-modules==0.2.1
+pycodestyle==2.8.0
+pickleshare==0.7.5
+opt-einsum==3.3.0
+click==8.0.3
+distro==1.7.0
+Babel==2.8.0
+namex==0.0.8
+pytz==2022.1

wandb/run-20250201_230729-f0utp5v4/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "os":  "Linux-6.8.0-1013-nvidia-64k-aarch64-with-glibc2.35",
+  "python":  "CPython 3.10.12",
+  "startedAt":  "2025-02-01T23:07:29.707088Z",
+  "args":  [
+    "-cn",
+    "ministral.yaml"
+  ],
+  "program":  "/home/ubuntu/odesia-2025/train/train.py",
+  "codePath":  "train/train.py",
+  "git":  {
+    "remote":  "https://github.com/nbroad1881/odesia-2025.git",
+    "commit":  "fcdb0c7ae3ecfa5c0862a0e47d92f8d6f1a57e89"
+  },
+  "email":  "[email protected]",
+  "root":  "/home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29",
+  "host":  "192-222-56-224",
+  "executable":  "/usr/bin/python",
+  "cpu_count":  64,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA GH200 480GB",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "4261618229248",
+      "used":  "51781664768"
+    }
+  },
+  "memory":  {
+    "total":  "564442890240"
+  },
+  "cpu":  {
+    "count":  64,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GH200 480GB",
+      "memoryTotal":  "102625181696",
+      "cudaCores":  16896,
+      "architecture":  "Hopper"
+    }
+  ],
+  "cudaVersion":  "12.4"
+}

wandb/run-20250201_230729-f0utp5v4/logs/debug-core.log ADDED Viewed

	@@ -0,0 +1,6 @@

+{"time":"2025-02-01T23:07:29.510948818Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpiv5yo5ed/port-9387.txt","pid":9387,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
+{"time":"2025-02-01T23:07:29.512658204Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":9387}
+{"time":"2025-02-01T23:07:29.512650236Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":34097,"Zone":""}}
+{"time":"2025-02-01T23:07:29.705144233Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:56806"}
+{"time":"2025-02-01T23:07:29.707614937Z","level":"INFO","msg":"handleInformInit: received","streamId":"f0utp5v4","id":"127.0.0.1:56806"}
+{"time":"2025-02-01T23:07:29.811099597Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"f0utp5v4","id":"127.0.0.1:56806"}

wandb/run-20250201_230729-f0utp5v4/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,7 @@

+{"time":"2025-02-01T23:07:29.707725114Z","level":"INFO","msg":"stream: starting","core version":"0.19.5","symlink path":"/home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29/wandb/run-20250201_230729-f0utp5v4/logs/debug-core.log"}
+{"time":"2025-02-01T23:07:29.811064717Z","level":"INFO","msg":"created new stream","id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.811095949Z","level":"INFO","msg":"stream: started","id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.81113275Z","level":"INFO","msg":"writer: Do: started","stream_id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.811142254Z","level":"INFO","msg":"sender: started","stream_id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.811153678Z","level":"INFO","msg":"handler: started","stream_id":"f0utp5v4"}
+{"time":"2025-02-01T23:07:29.920262374Z","level":"INFO","msg":"Starting system monitor"}

wandb/run-20250201_230729-f0utp5v4/logs/debug.log ADDED Viewed

	@@ -0,0 +1,25 @@

+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Current SDK version is 0.19.5
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Configure stats pid to 9387
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Loading settings from /home/ubuntu/.config/wandb/settings
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Loading settings from /home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29/wandb/settings
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_setup.py:_flush():68] Loading settings from environment variables
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:setup_run_log_directory():637] Logging user logs to /home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29/wandb/run-20250201_230729-f0utp5v4/logs/debug.log
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:setup_run_log_directory():638] Logging internal logs to /home/ubuntu/odesia-2025/train/outputs/2025-02-01/23-07-29/wandb/run-20250201_230729-f0utp5v4/logs/debug-internal.log
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:init():756] calling init triggers
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:init():761] wandb.init called with sweep_config: {}
+config: {'time_start': '2025-02-01_23-07-29', 'DEBUG': False, 'debug_model': 'unsloth/Qwen2.5-7B-bnb-4bit', 'fold': 0, 'random_seed': True, 'train_on_all_folds': False, 'eval_only': False, 'merge_adapters': False, 'wandb_id': None, 'val_split_name': 'val', 'pad_token': '<pad>', 'response_template_ids': [4], 'num_proc': 20, 'hub_repo_tags': ['odesia'], 'script_args': {'dataset_name': 'nbroad/odesia-combined-v1', 'config': None, 'gradient_checkpointing_use_reentrant': True, 'ignore_bias_buffers': False}, 'model_config': {'model_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', 'torch_dtype': 'bfloat16', 'attn_implementation': 'flash_attention_2', 'use_peft': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj'], 'lora_modules_to_save': None, 'lora_task_type': 'CAUSAL_LM', 'use_rslora': True, 'load_in_8bit': False, 'load_in_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'use_bnb_nested_quant': True}, 'training_args': {'resume_from_checkpoint': None, 'output_dir': './', 'num_train_epochs': 1, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'warmup_ratio': 0.1, 'fp16': False, 'bf16': True, 'eval_strategy': 'steps', 'save_strategy': 'steps', 'eval_steps': 100, 'save_steps': 100, 'save_total_limit': 2, 'logging_steps': 2, 'run_name': None, 'weight_decay': 0.01, 'report_to': 'wandb', 'learning_rate': 6e-05, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'gradient_checkpointing': True, 'gradient_accumulation_steps': 8, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'optim': 'adamw_torch', 'dataloader_num_workers': 4, 'seed': 18, 'max_grad_norm': 2.0, 'load_best_model_at_end': True, 'push_to_hub': True, 'hub_private_repo': True, 'lr_scheduler_type': 'cosine', 'remove_unused_columns': False, 'ddp_find_unused_parameters': False, 'use_liger_kernel': True}, '_wandb': {}}
+2025-02-01 23:07:29,501 INFO    MainThread:9387 [wandb_init.py:init():789] starting backend
+2025-02-01 23:07:29,705 INFO    MainThread:9387 [wandb_init.py:init():793] sending inform_init request
+2025-02-01 23:07:29,706 INFO    MainThread:9387 [backend.py:_multiprocessing_setup():97] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-02-01 23:07:29,706 INFO    MainThread:9387 [wandb_init.py:init():808] backend started and connected
+2025-02-01 23:07:29,708 INFO    MainThread:9387 [wandb_init.py:init():901] updated telemetry
+2025-02-01 23:07:29,710 INFO    MainThread:9387 [wandb_init.py:init():926] communicating run to backend with 90.0 second timeout
+2025-02-01 23:07:29,918 INFO    MainThread:9387 [wandb_init.py:init():984] starting run threads in backend
+2025-02-01 23:07:30,016 INFO    MainThread:9387 [wandb_run.py:_console_start():2385] atexit reg
+2025-02-01 23:07:30,016 INFO    MainThread:9387 [wandb_run.py:_redirect():2235] redirect: wrap_raw
+2025-02-01 23:07:30,016 INFO    MainThread:9387 [wandb_run.py:_redirect():2300] Wrapping output streams.
+2025-02-01 23:07:30,016 INFO    MainThread:9387 [wandb_run.py:_redirect():2325] Redirects installed.
+2025-02-01 23:07:30,017 INFO    MainThread:9387 [wandb_init.py:init():1026] run started, returning control to user process
+2025-02-01 23:08:27,289 INFO    MainThread:9387 [wandb_run.py:_config_callback():1253] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': {'v_proj', 'up_proj', 'down_proj', 'k_proj', 'o_proj', 'q_proj', 'gate_proj'}, 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': True, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 131072, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'sliding_window': 32768, 'head_dim': 128, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 100000000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 
'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', '_attn_implementation_autoset': True, 'transformers_version': '4.48.2', 'model_type': 'mistral', 'wandb_id': 'f0utp5v4', 'fold': 0, 'group': 'clm', 'dataset': 'nbroad/odesia-combined-v1', 'output_dir': './', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 6e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 2.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Feb01_23-07-29_192-222-56-224', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 
'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 18, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': 'nbroad/nbroad-odesia-clm-f0utp5v4', 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 
'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': True, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'dataset_text_field': 'text', 'packing': False, 'max_seq_length': 1024, 'dataset_num_proc': None, 'dataset_batch_size': 1000, 'model_init_kwargs': None, 'dataset_kwargs': {}, 'eval_packing': None, 'num_of_sequences': 1024, 'chars_per_token': '<CHARS_PER_TOKEN>', 'use_liger': False}
+2025-02-01 23:08:27,294 INFO    MainThread:9387 [wandb_config.py:__setitem__():154] config set model/num_parameters = 8063455232 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0xfedf481b3130>>
+2025-02-01 23:08:27,294 INFO    MainThread:9387 [wandb_run.py:_config_callback():1253] config_cb model/num_parameters 8063455232 None

wandb/run-20250201_230729-f0utp5v4/run-f0utp5v4.wandb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8fb65e22e6313999083f2674dafed0b1b9e0ec57f1fdcf1c51c6f517ea2a1b8a
+size 229376