ekshat committed
Commit fed424d · Parent: dd6d985

Update README.md

Files changed (1)
  1. README.md +28 -28
README.md CHANGED
@@ -41,66 +41,66 @@ print(result[0]['generated_text'])
 
 # Model Information
- - **model_name = "NousResearch/Llama-2-7b-chat-hf"
+ - **model_name = "NousResearch/Llama-2-7b-chat-hf"**
 
- - **dataset_name = "b-mc2/sql-create-context"
+ - **dataset_name = "b-mc2/sql-create-context"**
 
 # QLoRA parameters
- - **lora_r = 64
+ - **lora_r = 64**
 
- - **lora_alpha = 16
+ - **lora_alpha = 16**
 
- - **lora_dropout = 0.1
+ - **lora_dropout = 0.1**
 
 # bitsandbytes parameters
- - **use_4bit = True
+ - **use_4bit = True**
 
- - **bnb_4bit_compute_dtype = "float16"
+ - **bnb_4bit_compute_dtype = "float16"**
 
- - **bnb_4bit_quant_type = "nf4"
+ - **bnb_4bit_quant_type = "nf4"**
 
- - **use_nested_quant = False
+ - **use_nested_quant = False**
 
 # TrainingArguments parameters
- - **num_train_epochs = 1
+ - **num_train_epochs = 1**
 
- - **fp16 = False
+ - **fp16 = False**
 
- - **bf16 = False
+ - **bf16 = False**
 
- - **per_device_train_batch_size = 8
+ - **per_device_train_batch_size = 8**
 
- - **per_device_eval_batch_size = 4
+ - **per_device_eval_batch_size = 4**
 
- - **gradient_accumulation_steps = 1
+ - **gradient_accumulation_steps = 1**
 
- - **gradient_checkpointing = True
+ - **gradient_checkpointing = True**
 
- - **max_grad_norm = 0.3
+ - **max_grad_norm = 0.3**
 
- - **learning_rate = 2e-4
+ - **learning_rate = 2e-4**
 
- - **weight_decay = 0.001
+ - **weight_decay = 0.001**
 
- - **optim = "paged_adamw_32bit"
+ - **optim = "paged_adamw_32bit"**
 
- - **lr_scheduler_type = "cosine"
+ - **lr_scheduler_type = "cosine"**
 
- - **max_steps = -1
+ - **max_steps = -1**
 
- - **warmup_ratio = 0.03
+ - **warmup_ratio = 0.03**
 
- - **group_by_length = True
+ - **group_by_length = True**
 
- - **save_steps = 0
+ - **save_steps = 0**
 
- - **logging_steps = 25
+ - **logging_steps = 25**
 
 # SFT parameters
- - **max_seq_length = None
+ - **max_seq_length = None**
 
- - **packing = False
+ - **packing = False**
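
For context, the parameter groups in the updated README correspond to the usual QLoRA fine-tuning stack (datasets, transformers, peft, trl). The sketches below show how they would typically be wired together; they are assumptions based on those libraries' standard APIs, not code taken from this repository. Starting with the model and dataset identifiers under "Model Information" (the `split="train"` choice and the tokenizer padding setup are assumptions):

```python
# Minimal sketch: resolve the model and dataset named in the README.
# Not the repository's training script; split and padding settings are assumptions.
from datasets import load_dataset
from transformers import AutoTokenizer

model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "b-mc2/sql-create-context"

dataset = load_dataset(dataset_name, split="train")

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Llama-2 ships without a pad token
tokenizer.padding_side = "right"
```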
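The QLoRA parameters map onto a `peft.LoraConfig`. The `bias` and `task_type` values below are common defaults and are not listed in the README; `target_modules` is left to peft's defaults for the same reason:

```python
# Sketch: the README's LoRA settings expressed as a peft.LoraConfig.
from peft import LoraConfig

peft_config = LoraConfig(
    r=64,               # lora_r
    lora_alpha=16,      # lora_alpha
    lora_dropout=0.1,   # lora_dropout
    bias="none",        # assumption, not stated in the README
    task_type="CAUSAL_LM",
)
```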
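The bitsandbytes parameters describe 4-bit NF4 quantization and would typically be passed as a `transformers.BitsAndBytesConfig` when the base model is loaded (`device_map="auto"` and disabling the KV cache are assumptions, not README values):

```python
# Sketch: 4-bit quantization settings from the README, applied at model load.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # use_4bit
    bnb_4bit_compute_dtype=torch.float16,  # bnb_4bit_compute_dtype
    bnb_4bit_quant_type="nf4",             # bnb_4bit_quant_type
    bnb_4bit_use_double_quant=False,       # use_nested_quant
)

model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-chat-hf",
    quantization_config=bnb_config,
    device_map="auto",                     # assumption
)
model.config.use_cache = False             # commonly disabled with gradient checkpointing
```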
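The TrainingArguments parameters translate one-to-one into `transformers.TrainingArguments`; `output_dir` is not given in the README, so the value below is a placeholder:

```python
# Sketch: the README's training hyperparameters as TrainingArguments.
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    output_dir="./results",            # placeholder, not in the README
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    learning_rate=2e-4,
    weight_decay=0.001,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    fp16=False,
    bf16=False,
    save_steps=0,
    logging_steps=25,
)
```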
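Finally, the SFT parameters belong to trl's `SFTTrainer`. The sketch below reuses the objects from the previous sketches and assumes an older trl release (pre-0.12) in which `SFTTrainer` accepts these arguments directly; newer releases move them into `SFTConfig`. The `dataset_text_field="text"` value is a guess at how the prompts are formatted, not something stated in the README:

```python
# Sketch: tying the pieces together with trl.SFTTrainer (older trl API assumed).
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,                   # 4-bit base model from the bitsandbytes sketch
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",     # assumption about the preprocessed column name
    max_seq_length=None,           # max_seq_length
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,                 # packing
)
trainer.train()
```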