Commit 51d42b1
Parent(s): 6839937
Upload training_files

training_files/alpaca-megaset-fixed.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd16fa0cb1e2402ab5839ec2231ceacf8062070cd750b50b879e74cb16603d3e
+size 30418704

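The dataset itself is stored with Git LFS, so the diff only shows the pointer (roughly 30 MB of JSON). Judging from the key check in training_files/dataset_validator.py later in this commit, each record presumably uses the Alpaca-style instruction/input/output fields. A minimal sketch of one such record and the same key check; the field values here are invented for illustration:

import json

# Hypothetical record in the Alpaca-style schema that dataset_validator.py
# checks for; the text values below are made up for illustration.
example_record = {
    "instruction": "Summarize the following text.",
    "input": "Git LFS stores large files outside of the normal git history.",
    "output": "Git LFS keeps big files out of the repository itself.",
}

# The same rule the validator applies: every record needs all three keys.
required = ("instruction", "input", "output")
missing = [k for k in required if k not in example_record]
print("valid record" if not missing else f"missing keys: {missing}")
print(json.dumps(example_record, indent=2, ensure_ascii=False))
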
training_files/convert-hf-to-pth-16b.py
ADDED
@@ -0,0 +1,109 @@
+#Convert hf to pth
+import os
+import json
+
+import torch
+from transformers import LlamaTokenizer, LlamaForCausalLM
+
+tokenizer = LlamaTokenizer.from_pretrained("./llama-7b-hf")
+
+base_model = LlamaForCausalLM.from_pretrained(
+    "output_7b",
+    load_in_8bit=False,
+    torch_dtype=torch.float16,
+    device_map={"": "cpu"},
+)
+
+base_model_sd = base_model.state_dict()
+
+params = {
+    "dim": 4096,
+    "multiple_of": 256,
+    "n_heads": 32,
+    "n_layers": 32,
+    "norm_eps": 1e-06,
+    "vocab_size": -1,
+}
+n_layers = params["n_layers"]
+n_heads = params["n_heads"]
+dim = params["dim"]
+dims_per_head = dim // n_heads
+base = 10000.0
+inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+
+
+def permute(w):
+    return (
+        w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
+    )
+
+
+def unpermute(w):
+    return (
+        w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
+    )
+
+
+def translate_state_dict_key(k):
+    k = k.replace("base_model.model.", "")
+    if k == "model.embed_tokens.weight":
+        return "tok_embeddings.weight"
+    elif k == "model.norm.weight":
+        return "norm.weight"
+    elif k == "lm_head.weight":
+        return "output.weight"
+    elif k.startswith("model.layers."):
+        layer = k.split(".")[2]
+        if k.endswith(".self_attn.q_proj.weight"):
+            return f"layers.{layer}.attention.wq.weight"
+        elif k.endswith(".self_attn.k_proj.weight"):
+            return f"layers.{layer}.attention.wk.weight"
+        elif k.endswith(".self_attn.v_proj.weight"):
+            return f"layers.{layer}.attention.wv.weight"
+        elif k.endswith(".self_attn.o_proj.weight"):
+            return f"layers.{layer}.attention.wo.weight"
+        elif k.endswith(".mlp.gate_proj.weight"):
+            return f"layers.{layer}.feed_forward.w1.weight"
+        elif k.endswith(".mlp.down_proj.weight"):
+            return f"layers.{layer}.feed_forward.w2.weight"
+        elif k.endswith(".mlp.up_proj.weight"):
+            return f"layers.{layer}.feed_forward.w3.weight"
+        elif k.endswith(".input_layernorm.weight"):
+            return f"layers.{layer}.attention_norm.weight"
+        elif k.endswith(".post_attention_layernorm.weight"):
+            return f"layers.{layer}.ffn_norm.weight"
+        elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
+            return None
+        else:
+            print(layer, k)
+            raise NotImplementedError
+    else:
+        print(k)
+        raise NotImplementedError
+
+
+new_state_dict = {}
+for k, v in base_model_sd.items():
+    new_k = translate_state_dict_key(k)
+    if new_k is not None:
+        if "wq" in new_k or "wk" in new_k:
+            new_state_dict[new_k] = unpermute(v)
+        else:
+            new_state_dict[new_k] = v
+
+torch.save(new_state_dict, "consolidated.00.pth")
+
+with open("params.json", "w") as f:
+    json.dump(params, f)
+
+#Resize tensors
+model = torch.load("consolidated.00.pth", map_location=torch.device('cpu'))
+x = model["tok_embeddings.weight"]
+y = model["output.weight"]
+row_exclude = 32000
+x = x[:row_exclude]
+y = y[:row_exclude]
+model["tok_embeddings.weight"] = x
+model["output.weight"] = y
+torch.save(model, "consolidated.01.pth")
+#Delete consolidated.00.pth and rename consolidated.01.pth into consolidated.00.pth

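Because the resize step above trims tok_embeddings.weight and output.weight down to 32000 rows (the Stanford Alpaca training run presumably adds a pad token, growing the vocabulary past 32000), it can be worth checking the saved checkpoint before converting it to ggml. A minimal sketch, assuming the resized file has already been renamed to consolidated.00.pth as the last comment instructs:

import torch

# Sanity-check the converted checkpoint produced by the script above.
# Assumes the resized file has already been renamed to consolidated.00.pth.
sd = torch.load("consolidated.00.pth", map_location="cpu")

for name in ("tok_embeddings.weight", "output.weight"):
    shape = tuple(sd[name].shape)
    print(name, shape)
    # Both tensors should be 32000 x 4096 for the 7B configuration.
    assert shape == (32000, 4096), f"unexpected shape for {name}: {shape}"

# Informational: number of per-layer weight entries in the state dict.
print("layer keys found:", sum(1 for k in sd if k.startswith("layers.")))
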
training_files/convert-hf-to-pth-32b.py
ADDED
@@ -0,0 +1,97 @@
+#Convert hf to pth
+import os
+import json
+
+import torch
+from transformers import LlamaTokenizer, LlamaForCausalLM
+
+tokenizer = LlamaTokenizer.from_pretrained("./llama-7b-hf")
+
+base_model = LlamaForCausalLM.from_pretrained(
+    "output_7b",
+    load_in_8bit=False,
+    torch_dtype=torch.float16,
+    device_map={"": "cpu"},
+)
+
+base_model_sd = base_model.state_dict()
+
+params = {
+    "dim": 4096,
+    "multiple_of": 256,
+    "n_heads": 32,
+    "n_layers": 32,
+    "norm_eps": 1e-06,
+    "vocab_size": -1,
+}
+n_layers = params["n_layers"]
+n_heads = params["n_heads"]
+dim = params["dim"]
+dims_per_head = dim // n_heads
+base = 10000.0
+inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+
+
+def permute(w):
+    return (
+        w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
+    )
+
+
+def unpermute(w):
+    return (
+        w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
+    )
+
+
+def translate_state_dict_key(k):
+    k = k.replace("base_model.model.", "")
+    if k == "model.embed_tokens.weight":
+        return "tok_embeddings.weight"
+    elif k == "model.norm.weight":
+        return "norm.weight"
+    elif k == "lm_head.weight":
+        return "output.weight"
+    elif k.startswith("model.layers."):
+        layer = k.split(".")[2]
+        if k.endswith(".self_attn.q_proj.weight"):
+            return f"layers.{layer}.attention.wq.weight"
+        elif k.endswith(".self_attn.k_proj.weight"):
+            return f"layers.{layer}.attention.wk.weight"
+        elif k.endswith(".self_attn.v_proj.weight"):
+            return f"layers.{layer}.attention.wv.weight"
+        elif k.endswith(".self_attn.o_proj.weight"):
+            return f"layers.{layer}.attention.wo.weight"
+        elif k.endswith(".mlp.gate_proj.weight"):
+            return f"layers.{layer}.feed_forward.w1.weight"
+        elif k.endswith(".mlp.down_proj.weight"):
+            return f"layers.{layer}.feed_forward.w2.weight"
+        elif k.endswith(".mlp.up_proj.weight"):
+            return f"layers.{layer}.feed_forward.w3.weight"
+        elif k.endswith(".input_layernorm.weight"):
+            return f"layers.{layer}.attention_norm.weight"
+        elif k.endswith(".post_attention_layernorm.weight"):
+            return f"layers.{layer}.ffn_norm.weight"
+        elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
+            return None
+        else:
+            print(layer, k)
+            raise NotImplementedError
+    else:
+        print(k)
+        raise NotImplementedError
+
+
+new_state_dict = {}
+for k, v in base_model_sd.items():
+    new_k = translate_state_dict_key(k)
+    if new_k is not None:
+        if "wq" in new_k or "wk" in new_k:
+            new_state_dict[new_k] = unpermute(v)
+        else:
+            new_state_dict[new_k] = v
+
+torch.save(new_state_dict, "consolidated.00.pth")
+
+with open("params.json", "w") as f:
+    json.dump(params, f)

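This 32b variant is identical to the 16b script above except that it skips the final vocabulary-resize step. Both scripts hard-code the 7B hyperparameters and the ./llama-7b-hf / output_7b paths, while the instructions below also reference a 13B model. As a sketch, the params block for a 13B checkpoint would presumably look like the following; the values are assumed from the published LLaMA-13B configuration and the paths are left to the reader:

# Hypothetical params for converting a 13B checkpoint with the same script;
# hyperparameters assumed from the published LLaMA-13B configuration.
params = {
    "dim": 5120,
    "multiple_of": 256,
    "n_heads": 40,
    "n_layers": 40,
    "norm_eps": 1e-06,
    "vocab_size": -1,
}
# dim and n_heads feed into permute()/unpermute(), so they must match the
# checkpoint actually being converted or the attention weights come out scrambled.
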
training_files/dataset_validator.py
ADDED
@@ -0,0 +1,17 @@
+import json
+
+print("This program will validate the JSON training data.")
+
+file = input("Enter the file name with extension: ")
+
+# Load the JSON file
+with open(file, "r", encoding="utf8") as f:
+    data = json.load(f)
+
+# Check each item in the JSON file
+for item in data:
+    if "instruction" not in item or "input" not in item or "output" not in item:
+        print("Error: Missing key in JSON item.")
+        print(item)
+
+print("File done. ")

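To run the check: python training_files/dataset_validator.py, then type the dataset filename (for example alpaca-megaset-fixed.json, after pulling it from LFS) at the prompt. Any record missing one of the three keys is printed as it is found, and the script ends with "File done."
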
training_files/full-training-instructions.txt
ADDED
@@ -0,0 +1,65 @@
+wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh
+
+bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh
+
+enter, enter, yes, defaults
+
+sudo reboot
+
+conda activate
+conda create -n alpaca python=3.10
+conda activate alpaca
+
+export PATH="/home/ubuntu/miniconda3/envs/alpaca/bin:$PATH"
+
+sudo apt-get install git-lfs
+git lfs install
+
+git clone https://github.com/tatsu-lab/stanford_alpaca
+
+git clone https://huggingface.co/decapoda-research/llama-7b-hf
+#remember to edit the tokenizer_config.json from LLaMATokenizer to LlamaTokenizer
+
+git clone https://huggingface.co/8bit-coder/alpaca-7b-nativeEnhanced
+
+pip install sentencepiece
+pip install git+https://github.com/huggingface/transformers.git
+
+cd ./stanford_alpaca
+
+pip install -r requirements.txt
+
+cd ..
+
+torchrun --nproc_per_node=8 --master_port=3045 ./stanford_alpaca/train.py --model_name_or_path ./llama-7b-hf --data_path ./alpaca-7b-nativeEnhanced/training_files/alpaca-megaset-fixed.json --fp16 True --output_dir ./output_7b --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 16 --evaluation_strategy "no" --save_strategy "steps" --save_steps 200 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 1 --fsdp "full_shard auto_wrap" --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' --tf32 True
+
+# now, make sure with nano that script1.py has proper paths to everything
+
+pip install -q datasets loralib sentencepiece
+pip install bitsandbytes
+
+python script1.py
+
+git clone https://github.com/antimatter15/alpaca.cpp
+
+cd alpaca.cpp
+mkdir models
+cd ..
+
+mv consolidated.01.pth ./alpaca.cpp/models/consolidated.00.pth
+mv params.json ./alpaca.cpp/models/params.json
+mv output_13b/tokenizer.model ./alpaca.cpp/models/tokenizer.model
+
+cd alpaca.cpp
+
+make
+
+cd ..
+
+python .deez/convert-pth-to-ggml.py ./alpaca.cpp/models 2 (1 for 7b, 2 for 13b, and the rest you can check yourself ;)
+
+cd alpaca.cpp
+
+./quantize models/ggml-model-f16.bin ggml-alpaca-13b-nativeEnhanced-q4.bin 2
+
+there's your finished model!
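
From here, the quantized ggml-alpaca-13b-nativeEnhanced-q4.bin can presumably be run with alpaca.cpp's chat binary (something like ./chat -m ggml-alpaca-13b-nativeEnhanced-q4.bin from inside the alpaca.cpp directory); the exact invocation is an assumption, since it is not spelled out in these instructions.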