# Measure how well the trained tokenizer compresses the corpus, reported as
# UTF-8 bytes per token (a higher ratio means fewer tokens for the same text).
import os

from transformers import PreTrainedTokenizerFast

# Load the tokenizer from its serialized definition.
tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer/hf/tokenizer.json")

# Decode explicitly as UTF-8: the byte count below is taken in UTF-8, so the
# corpus must not be read with the platform's default encoding.
with open("tokenizer/corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

num_bytes = len(text.encode("utf-8"))
# NOTE(review): encode() is called with its defaults; if the tokenizer is
# configured to add special tokens they are included in this count - confirm
# whether that is intended for the measurement.
num_tokens = len(tok.encode(text))

# An empty corpus (or a tokenizer that emits nothing) would otherwise surface
# as a bare ZeroDivisionError with no hint about the cause.
if num_tokens == 0:
    raise ValueError(
        "tokenizer/corpus.txt produced zero tokens; cannot compute a compression ratio"
    )

ratio = num_bytes / num_tokens
print("Compression ratio:", ratio)
# Expected ratio is around 3.5 to 4.5 bytes per token for a good tokenizer