|
{ |
|
"_name": null, |
|
"architectures": [ |
|
"RobertaModel" |
|
], |
|
"attention_probs_dropout_prob": 0.1, |
|
"bmuf": { |
|
"_name": null, |
|
"average_sync": false, |
|
"block_lr": 1.0, |
|
"block_momentum": 0.875, |
|
"distributed_world_size": 2, |
|
"global_sync_iter": 50, |
|
"use_nbm": false, |
|
"warmup_iterations": 500 |
|
}, |
|
"bos_token_id": 0, |
|
"bpe": null, |
|
"checkpoint": { |
|
"_name": null, |
|
"best_checkpoint_metric": "loss", |
|
"checkpoint_shard_count": 1, |
|
"checkpoint_suffix": "", |
|
"continue_once": null, |
|
"finetune_from_model": null, |
|
"keep_best_checkpoints": -1, |
|
"keep_interval_updates": -1, |
|
"keep_interval_updates_pattern": -1, |
|
"keep_last_epochs": -1, |
|
"load_checkpoint_on_all_dp_ranks": false, |
|
"maximize_best_checkpoint_metric": false, |
|
"model_parallel_size": 1, |
|
"no_epoch_checkpoints": true, |
|
"no_last_checkpoints": false, |
|
"no_save": false, |
|
"no_save_optimizer_state": false, |
|
"optimizer_overrides": "{}", |
|
"patience": -1, |
|
"reset_dataloader": false, |
|
"reset_lr_scheduler": false, |
|
"reset_meters": false, |
|
"reset_optimizer": false, |
|
"restore_file": "checkpoint_last.pt", |
|
"save_dir": "roberturk_checkpoints", |
|
"save_interval": 1, |
|
"save_interval_updates": 50000, |
|
"write_checkpoints_asynchronously": false |
|
}, |
|
"classifier_dropout": null, |
|
"common": { |
|
"_name": null, |
|
"aim_repo": null, |
|
"aim_run_hash": null, |
|
"all_gather_list_size": 16384, |
|
"amp": false, |
|
"amp_batch_retries": 2, |
|
"amp_init_scale": 128, |
|
"amp_scale_window": null, |
|
"azureml_logging": false, |
|
"bf16": false, |
|
"cpu": false, |
|
"empty_cache_freq": 0, |
|
"fp16": true, |
|
"fp16_init_scale": 128, |
|
"fp16_no_flatten_grads": false, |
|
"fp16_scale_tolerance": 0.0, |
|
"fp16_scale_window": null, |
|
"log_file": null, |
|
"log_format": "json", |
|
"log_interval": 200, |
|
"memory_efficient_bf16": false, |
|
"memory_efficient_fp16": false, |
|
"min_loss_scale": 0.0001, |
|
"model_parallel_size": 1, |
|
"no_progress_bar": false, |
|
"on_cpu_convert_precision": false, |
|
"plasma_path": "/tmp/plasma", |
|
"profile": false, |
|
"quantization_config_path": null, |
|
"reset_logging": false, |
|
"seed": 1, |
|
"suppress_crashes": false, |
|
"tensorboard_logdir": null, |
|
"threshold_loss_scale": null, |
|
"tpu": false, |
|
"use_plasma_view": false, |
|
"user_dir": null, |
|
"wandb_project": null |
|
}, |
|
"common_eval": { |
|
"_name": null, |
|
"model_overrides": "{}", |
|
"path": null, |
|
"post_process": null, |
|
"quiet": false, |
|
"results_path": null |
|
}, |
|
"criterion": { |
|
"_name": "masked_lm", |
|
"tpu": false |
|
}, |
|
"dataset": { |
|
"_name": null, |
|
"batch_size": 16, |
|
"batch_size_valid": 16, |
|
"combine_valid_subsets": null, |
|
"curriculum": 0, |
|
"data_buffer_size": 10, |
|
"dataset_impl": null, |
|
"disable_validation": false, |
|
"fixed_validation_seed": null, |
|
"gen_subset": "test", |
|
"grouped_shuffling": false, |
|
"ignore_unused_valid_subsets": true, |
|
"max_tokens": null, |
|
"max_tokens_valid": null, |
|
"max_valid_steps": null, |
|
"num_shards": 1, |
|
"num_workers": 1, |
|
"required_batch_size_multiple": 8, |
|
"required_seq_len_multiple": 1, |
|
"shard_id": 0, |
|
"skip_invalid_size_inputs_valid_test": true, |
|
"train_subset": "train", |
|
"update_epoch_batch_itr": false, |
|
"update_ordered_indices_seed": false, |
|
"valid_subset": "valid", |
|
"validate_after_updates": 0, |
|
"validate_interval": 1, |
|
"validate_interval_updates": 0 |
|
}, |
|
"distributed_training": { |
|
"_name": null, |
|
"broadcast_buffers": false, |
|
"bucket_cap_mb": 25, |
|
"cpu_offload": false, |
|
"ddp_backend": "pytorch_ddp", |
|
"ddp_comm_hook": "none", |
|
"device_id": 0, |
|
"distributed_backend": "nccl", |
|
"distributed_init_method": "tcp://localhost:51855", |
|
"distributed_no_spawn": false, |
|
"distributed_num_procs": 2, |
|
"distributed_port": 51855, |
|
"distributed_rank": 0, |
|
"distributed_world_size": 2, |
|
"fast_stat_sync": false, |
|
"find_unused_parameters": false, |
|
"fix_batches_to_gpus": false, |
|
"fp16": true, |
|
"fp32_reduce_scatter": false, |
|
"gradient_as_bucket_view": false, |
|
"heartbeat_timeout": -1, |
|
"localsgd_frequency": 3, |
|
"memory_efficient_fp16": false, |
|
"no_reshard_after_forward": false, |
|
"not_fsdp_flatten_parameters": false, |
|
"nprocs_per_node": 2, |
|
"pipeline_balance": null, |
|
"pipeline_checkpoint": "never", |
|
"pipeline_chunks": 0, |
|
"pipeline_decoder_balance": null, |
|
"pipeline_decoder_devices": null, |
|
"pipeline_devices": null, |
|
"pipeline_encoder_balance": null, |
|
"pipeline_encoder_devices": null, |
|
"pipeline_model_parallel": false, |
|
"slowmo_base_algorithm": "localsgd", |
|
"slowmo_momentum": null, |
|
"tpu": false, |
|
"use_sharded_state": false, |
|
"zero_sharding": "none" |
|
}, |
|
"ema": { |
|
"_name": null, |
|
"ema_decay": 0.9999, |
|
"ema_fp32": false, |
|
"ema_seed_model": null, |
|
"ema_start_update": 0, |
|
"ema_update_freq": 1, |
|
"store_ema": false |
|
}, |
|
"eos_token_id": 2, |
|
"eval_lm": { |
|
"_name": null, |
|
"context_window": 0, |
|
"output_word_probs": false, |
|
"output_word_stats": false, |
|
"softmax_batch": 9223372036854775807 |
|
}, |
|
"generation": { |
|
"_name": null, |
|
"beam": 5, |
|
"beam_mt": 0, |
|
"constraints": null, |
|
"decoding_format": null, |
|
"diverse_beam_groups": -1, |
|
"diverse_beam_strength": 0.5, |
|
"diversity_rate": -1.0, |
|
"eos_token": null, |
|
"iter_decode_eos_penalty": 0.0, |
|
"iter_decode_force_max_iter": false, |
|
"iter_decode_max_iter": 10, |
|
"iter_decode_with_beam": 1, |
|
"iter_decode_with_external_reranker": false, |
|
"lenpen": 1.0, |
|
"lenpen_mt": 1.0, |
|
"lm_path": null, |
|
"lm_weight": 0.0, |
|
"match_source_len": false, |
|
"max_len_a": 0.0, |
|
"max_len_a_mt": 0.0, |
|
"max_len_b": 200, |
|
"max_len_b_mt": 200, |
|
"min_len": 1, |
|
"nbest": 1, |
|
"no_beamable_mm": false, |
|
"no_early_stop": false, |
|
"no_repeat_ngram_size": 0, |
|
"no_seed_provided": false, |
|
"prefix_size": 0, |
|
"print_alignment": null, |
|
"print_step": false, |
|
"replace_unk": null, |
|
"retain_dropout": false, |
|
"retain_dropout_modules": null, |
|
"retain_iter_history": false, |
|
"sacrebleu": false, |
|
"sampling": false, |
|
"sampling_topk": -1, |
|
"sampling_topp": -1.0, |
|
"score_reference": false, |
|
"temperature": 1.0, |
|
"unkpen": 0.0, |
|
"unnormalized": false |
|
}, |
|
"hidden_act": "gelu", |
|
"hidden_dropout_prob": 0.1, |
|
"hidden_size": 768, |
|
"initializer_range": 0.02, |
|
"interactive": { |
|
"_name": null, |
|
"buffer_size": 0, |
|
"input": "-" |
|
}, |
|
"intermediate_size": 3072, |
|
"job_logging_cfg": { |
|
"disable_existing_loggers": false, |
|
"formatters": { |
|
"simple": { |
|
"format": "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s" |
|
} |
|
}, |
|
"handlers": { |
|
"console": { |
|
"class": "logging.StreamHandler", |
|
"formatter": "simple", |
|
"stream": "ext://sys.stdout" |
|
}, |
|
"file": { |
|
"class": "logging.FileHandler", |
|
"filename": "hydra_train.log", |
|
"formatter": "simple" |
|
} |
|
}, |
|
"root": { |
|
"handlers": [ |
|
"console", |
|
"file" |
|
], |
|
"level": "INFO" |
|
}, |
|
"version": 1 |
|
}, |
|
"layer_norm_eps": 1e-12, |
|
"lr_scheduler": { |
|
"_name": "polynomial_decay", |
|
"end_learning_rate": 0.0, |
|
"force_anneal": null, |
|
"lr": [ |
|
1e-05 |
|
], |
|
"power": 1.0, |
|
"total_num_update": 10000000.0, |
|
"warmup_updates": 10000 |
|
}, |
|
"max_position_embeddings": 512, |
|
"model": { |
|
"_name": "roberta", |
|
"activation_dropout": 0.0, |
|
"activation_fn": "gelu", |
|
"adaptive_input": false, |
|
"attention_dropout": 0.1, |
|
"dropout": 0.1, |
|
"encoder_attention_heads": 12, |
|
"encoder_embed_dim": 768, |
|
"encoder_ffn_embed_dim": 3072, |
|
"encoder_layerdrop": 0.0, |
|
"encoder_layers": 12, |
|
"encoder_layers_to_keep": null, |
|
"encoder_learned_pos": true, |
|
"encoder_normalize_before": false, |
|
"layernorm_embedding": true, |
|
"max_positions": 256, |
|
"max_source_positions": 256, |
|
"no_scale_embedding": true, |
|
"no_token_positional_embeddings": false, |
|
"pooler_activation_fn": "tanh", |
|
"pooler_dropout": 0.0, |
|
"quant_noise_pq": 0, |
|
"quant_noise_pq_block_size": 8, |
|
"quant_noise_scalar": 0, |
|
"spectral_norm_classification_head": false, |
|
"untie_weights_roberta": false |
|
}, |
|
"model_type": "roberta", |
|
"num_attention_heads": 12, |
|
"num_hidden_layers": 12, |
|
"optimization": { |
|
"_name": null, |
|
"clip_norm": 0.0, |
|
"debug_param_names": false, |
|
"lr": [ |
|
1e-05 |
|
], |
|
"max_epoch": 0, |
|
"max_update": 10000000, |
|
"sentence_avg": false, |
|
"skip_remainder_batch": false, |
|
"stop_min_lr": -1.0, |
|
"stop_time_hours": 0.0, |
|
"update_freq": [ |
|
8 |
|
], |
|
"use_bmuf": false |
|
}, |
|
"optimizer": { |
|
"_name": "adam", |
|
"adam_betas": "(0.9,0.98)", |
|
"adam_eps": 1e-06, |
|
"fp16_adam_stats": false, |
|
"lr": [ |
|
1e-05 |
|
], |
|
"tpu": false, |
|
"use_old_adam": false, |
|
"weight_decay": 0.01 |
|
}, |
|
"pad_token_id": 1, |
|
"position_embedding_type": "absolute", |
|
"scoring": null, |
|
"task": { |
|
"_name": "masked_lm", |
|
"d2v2_multi": false, |
|
"data": "data-bin/roberturk_bin", |
|
"freq_weighted_replacement": false, |
|
"include_index": true, |
|
"include_target_tokens": false, |
|
"leave_unmasked_prob": 0.1, |
|
"mask_multiple_length": 1, |
|
"mask_prob": 0.15, |
|
"mask_stdev": 0.0, |
|
"mask_whole_words": false, |
|
"random_token_prob": 0.1, |
|
"sample_break_mode": "complete", |
|
"seed": 1, |
|
"shorten_data_split_list": "", |
|
"shorten_method": "none", |
|
"skip_masking": false, |
|
"tokens_per_sample": 256 |
|
}, |
|
"tokenizer": null, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.35.2", |
|
"type_vocab_size": 2, |
|
"use_cache": true, |
|
"vocab_size": 50265 |
|
} |
|
|