from tokenizers import ByteLevelBPETokenizer
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pickle
# Paths to the tokenizer files, the cleaned corpus, and the output directory
tokenizer_dir = "tokenizer_london"
corpus_path = "london_corpus_cleaned_merged.txt"
output_dir = Path("gpt_data_london")
output_dir.mkdir(exist_ok=True)
# Load the trained byte-level BPE tokenizer from its vocab and merges files
tokenizer = ByteLevelBPETokenizer(
    f"{tokenizer_dir}/vocab.json",
    f"{tokenizer_dir}/merges.txt"
)
# Read the full corpus into memory
with open(corpus_path, "r", encoding="utf-8") as f:
    data = f.read()
print("Encoding text...")
ids = tokenizer.encode(data).ids
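# uint16 keeps the token array compact and is safe as long as the vocab size stays below 65,536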
ids = np.array(ids, dtype=np.uint16)
# 90/10 split into train and validation sets
split = int(0.9 * len(ids))
train_ids, val_ids = ids[:split], ids[split:]
# Write the token IDs to raw binary .bin files
train_ids.tofile(output_dir / "train.bin")
val_ids.tofile(output_dir / "val.bin")
# Save metadata for decoding later
meta = {
    "vocab_size": tokenizer.get_vocab_size(),
    "tokenizer_config": {
        "vocab_file": f"{tokenizer_dir}/vocab.json",
        "merges_file": f"{tokenizer_dir}/merges.txt"
    }
}
with open(output_dir / "meta.pkl", "wb") as f:
    pickle.dump(meta, f)
print("Saved train.bin, val.bin and meta.pkl in:", output_dir)