Mini-LLM / data /raw /verify_compression_ratio.py
Ashx098's picture
Upload folder using huggingface_hub
f4e346e verified
raw
history blame contribute delete
401 Bytes
# Sanity-check the trained tokenizer by reporting its compression ratio:
# the number of UTF-8 bytes in the corpus divided by the number of tokens
# the tokenizer produces for that same corpus.
import os

TOKENIZER_FILE = "tokenizer/hf/tokenizer.json"
CORPUS_FILE = "tokenizer/corpus.txt"


def compression_ratio(num_bytes: int, num_tokens: int) -> float:
    """Return the average number of bytes represented by one token.

    Args:
        num_bytes: Size of the text in UTF-8 bytes.
        num_tokens: Number of tokens the text was encoded into.

    Returns:
        ``num_bytes / num_tokens`` as a float.

    Raises:
        ValueError: If ``num_tokens`` is not positive (e.g. the corpus is
            empty), since the ratio is undefined in that case.
    """
    if num_tokens <= 0:
        raise ValueError(
            "cannot compute a compression ratio: the corpus produced no tokens"
        )
    return num_bytes / num_tokens


def main() -> None:
    """Load the tokenizer and corpus, then print the compression ratio."""
    # Imported lazily: transformers is a heavy third-party dependency and is
    # only needed when the script is actually run.
    from transformers import PreTrainedTokenizerFast

    tok = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_FILE)

    # Decode explicitly as UTF-8 so the byte count below does not depend on
    # the platform's default locale encoding.
    with open(CORPUS_FILE, "r", encoding="utf-8") as f:
        text = f.read()

    num_bytes = len(text.encode("utf-8"))
    # Leave out any special tokens (BOS/EOS) the tokenizer might add; they do
    # not correspond to corpus bytes and would skew the ratio.
    num_tokens = len(tok.encode(text, add_special_tokens=False))

    ratio = compression_ratio(num_bytes, num_tokens)
    print("Compression ratio:", ratio)
    # Expected ratio is around 3.5 to 4.5 for a good tokenizer


if __name__ == "__main__":
    main()