roberturk-base / config.json
Nuri-Tas's picture
Upload model
e32d64f
{
"_name": null,
"architectures": [
"RobertaModel"
],
"attention_probs_dropout_prob": 0.1,
"bmuf": {
"_name": null,
"average_sync": false,
"block_lr": 1.0,
"block_momentum": 0.875,
"distributed_world_size": 2,
"global_sync_iter": 50,
"use_nbm": false,
"warmup_iterations": 500
},
"bos_token_id": 0,
"bpe": null,
"checkpoint": {
"_name": null,
"best_checkpoint_metric": "loss",
"checkpoint_shard_count": 1,
"checkpoint_suffix": "",
"continue_once": null,
"finetune_from_model": null,
"keep_best_checkpoints": -1,
"keep_interval_updates": -1,
"keep_interval_updates_pattern": -1,
"keep_last_epochs": -1,
"load_checkpoint_on_all_dp_ranks": false,
"maximize_best_checkpoint_metric": false,
"model_parallel_size": 1,
"no_epoch_checkpoints": true,
"no_last_checkpoints": false,
"no_save": false,
"no_save_optimizer_state": false,
"optimizer_overrides": "{}",
"patience": -1,
"reset_dataloader": false,
"reset_lr_scheduler": false,
"reset_meters": false,
"reset_optimizer": false,
"restore_file": "checkpoint_last.pt",
"save_dir": "roberturk_checkpoints",
"save_interval": 1,
"save_interval_updates": 50000,
"write_checkpoints_asynchronously": false
},
"classifier_dropout": null,
"common": {
"_name": null,
"aim_repo": null,
"aim_run_hash": null,
"all_gather_list_size": 16384,
"amp": false,
"amp_batch_retries": 2,
"amp_init_scale": 128,
"amp_scale_window": null,
"azureml_logging": false,
"bf16": false,
"cpu": false,
"empty_cache_freq": 0,
"fp16": true,
"fp16_init_scale": 128,
"fp16_no_flatten_grads": false,
"fp16_scale_tolerance": 0.0,
"fp16_scale_window": null,
"log_file": null,
"log_format": "json",
"log_interval": 200,
"memory_efficient_bf16": false,
"memory_efficient_fp16": false,
"min_loss_scale": 0.0001,
"model_parallel_size": 1,
"no_progress_bar": false,
"on_cpu_convert_precision": false,
"plasma_path": "/tmp/plasma",
"profile": false,
"quantization_config_path": null,
"reset_logging": false,
"seed": 1,
"suppress_crashes": false,
"tensorboard_logdir": null,
"threshold_loss_scale": null,
"tpu": false,
"use_plasma_view": false,
"user_dir": null,
"wandb_project": null
},
"common_eval": {
"_name": null,
"model_overrides": "{}",
"path": null,
"post_process": null,
"quiet": false,
"results_path": null
},
"criterion": {
"_name": "masked_lm",
"tpu": false
},
"dataset": {
"_name": null,
"batch_size": 16,
"batch_size_valid": 16,
"combine_valid_subsets": null,
"curriculum": 0,
"data_buffer_size": 10,
"dataset_impl": null,
"disable_validation": false,
"fixed_validation_seed": null,
"gen_subset": "test",
"grouped_shuffling": false,
"ignore_unused_valid_subsets": true,
"max_tokens": null,
"max_tokens_valid": null,
"max_valid_steps": null,
"num_shards": 1,
"num_workers": 1,
"required_batch_size_multiple": 8,
"required_seq_len_multiple": 1,
"shard_id": 0,
"skip_invalid_size_inputs_valid_test": true,
"train_subset": "train",
"update_epoch_batch_itr": false,
"update_ordered_indices_seed": false,
"valid_subset": "valid",
"validate_after_updates": 0,
"validate_interval": 1,
"validate_interval_updates": 0
},
"distributed_training": {
"_name": null,
"broadcast_buffers": false,
"bucket_cap_mb": 25,
"cpu_offload": false,
"ddp_backend": "pytorch_ddp",
"ddp_comm_hook": "none",
"device_id": 0,
"distributed_backend": "nccl",
"distributed_init_method": "tcp://localhost:51855",
"distributed_no_spawn": false,
"distributed_num_procs": 2,
"distributed_port": 51855,
"distributed_rank": 0,
"distributed_world_size": 2,
"fast_stat_sync": false,
"find_unused_parameters": false,
"fix_batches_to_gpus": false,
"fp16": true,
"fp32_reduce_scatter": false,
"gradient_as_bucket_view": false,
"heartbeat_timeout": -1,
"localsgd_frequency": 3,
"memory_efficient_fp16": false,
"no_reshard_after_forward": false,
"not_fsdp_flatten_parameters": false,
"nprocs_per_node": 2,
"pipeline_balance": null,
"pipeline_checkpoint": "never",
"pipeline_chunks": 0,
"pipeline_decoder_balance": null,
"pipeline_decoder_devices": null,
"pipeline_devices": null,
"pipeline_encoder_balance": null,
"pipeline_encoder_devices": null,
"pipeline_model_parallel": false,
"slowmo_base_algorithm": "localsgd",
"slowmo_momentum": null,
"tpu": false,
"use_sharded_state": false,
"zero_sharding": "none"
},
"ema": {
"_name": null,
"ema_decay": 0.9999,
"ema_fp32": false,
"ema_seed_model": null,
"ema_start_update": 0,
"ema_update_freq": 1,
"store_ema": false
},
"eos_token_id": 2,
"eval_lm": {
"_name": null,
"context_window": 0,
"output_word_probs": false,
"output_word_stats": false,
"softmax_batch": 9223372036854775807
},
"generation": {
"_name": null,
"beam": 5,
"beam_mt": 0,
"constraints": null,
"decoding_format": null,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"eos_token": null,
"iter_decode_eos_penalty": 0.0,
"iter_decode_force_max_iter": false,
"iter_decode_max_iter": 10,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"lenpen": 1.0,
"lenpen_mt": 1.0,
"lm_path": null,
"lm_weight": 0.0,
"match_source_len": false,
"max_len_a": 0.0,
"max_len_a_mt": 0.0,
"max_len_b": 200,
"max_len_b_mt": 200,
"min_len": 1,
"nbest": 1,
"no_beamable_mm": false,
"no_early_stop": false,
"no_repeat_ngram_size": 0,
"no_seed_provided": false,
"prefix_size": 0,
"print_alignment": null,
"print_step": false,
"replace_unk": null,
"retain_dropout": false,
"retain_dropout_modules": null,
"retain_iter_history": false,
"sacrebleu": false,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"score_reference": false,
"temperature": 1.0,
"unkpen": 0.0,
"unnormalized": false
},
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"interactive": {
"_name": null,
"buffer_size": 0,
"input": "-"
},
"intermediate_size": 3072,
"job_logging_cfg": {
"disable_existing_loggers": false,
"formatters": {
"simple": {
"format": "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
}
},
"handlers": {
"console": {
"class": "logging.StreamHandler",
"formatter": "simple",
"stream": "ext://sys.stdout"
},
"file": {
"class": "logging.FileHandler",
"filename": "hydra_train.log",
"formatter": "simple"
}
},
"root": {
"handlers": [
"console",
"file"
],
"level": "INFO"
},
"version": 1
},
"layer_norm_eps": 1e-12,
"lr_scheduler": {
"_name": "polynomial_decay",
"end_learning_rate": 0.0,
"force_anneal": null,
"lr": [
1e-05
],
"power": 1.0,
"total_num_update": 10000000.0,
"warmup_updates": 10000
},
"max_position_embeddings": 512,
"model": {
"_name": "roberta",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"adaptive_input": false,
"attention_dropout": 0.1,
"dropout": 0.1,
"encoder_attention_heads": 12,
"encoder_embed_dim": 768,
"encoder_ffn_embed_dim": 3072,
"encoder_layerdrop": 0.0,
"encoder_layers": 12,
"encoder_layers_to_keep": null,
"encoder_learned_pos": true,
"encoder_normalize_before": false,
"layernorm_embedding": true,
"max_positions": 256,
"max_source_positions": 256,
"no_scale_embedding": true,
"no_token_positional_embeddings": false,
"pooler_activation_fn": "tanh",
"pooler_dropout": 0.0,
"quant_noise_pq": 0,
"quant_noise_pq_block_size": 8,
"quant_noise_scalar": 0,
"spectral_norm_classification_head": false,
"untie_weights_roberta": false
},
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"optimization": {
"_name": null,
"clip_norm": 0.0,
"debug_param_names": false,
"lr": [
1e-05
],
"max_epoch": 0,
"max_update": 10000000,
"sentence_avg": false,
"skip_remainder_batch": false,
"stop_min_lr": -1.0,
"stop_time_hours": 0.0,
"update_freq": [
8
],
"use_bmuf": false
},
"optimizer": {
"_name": "adam",
"adam_betas": "(0.9,0.98)",
"adam_eps": 1e-06,
"fp16_adam_stats": false,
"lr": [
1e-05
],
"tpu": false,
"use_old_adam": false,
"weight_decay": 0.01
},
"pad_token_id": 1,
"position_embedding_type": "absolute",
"scoring": null,
"task": {
"_name": "masked_lm",
"d2v2_multi": false,
"data": "data-bin/roberturk_bin",
"freq_weighted_replacement": false,
"include_index": true,
"include_target_tokens": false,
"leave_unmasked_prob": 0.1,
"mask_multiple_length": 1,
"mask_prob": 0.15,
"mask_stdev": 0.0,
"mask_whole_words": false,
"random_token_prob": 0.1,
"sample_break_mode": "complete",
"seed": 1,
"shorten_data_split_list": "",
"shorten_method": "none",
"skip_masking": false,
"tokens_per_sample": 256
},
"tokenizer": null,
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 50265
}