# Measure the tokenizer's compression ratio on the corpus: UTF-8 bytes of the
# text divided by the number of token ids the tokenizer produces for it.
import os

from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer/hf/tokenizer.json")

# Decode explicitly as UTF-8 so the text read here matches the UTF-8 byte
# count below, regardless of the platform's default locale encoding.
with open("tokenizer/corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

num_bytes = len(text.encode("utf-8"))
# NOTE(review): encode() is called with its defaults; depending on the
# tokenizer config this may count special tokens too — confirm that is intended.
num_tokens = len(tok.encode(text))

# An empty corpus (or a tokenizer that emits nothing) would otherwise surface
# as an unexplained ZeroDivisionError.
if num_tokens == 0:
    raise SystemExit(
        "Cannot compute compression ratio: tokenizer produced 0 tokens "
        "for tokenizer/corpus.txt"
    )

ratio = num_bytes / num_tokens
print("Compression ratio:", ratio)
# Expected ratio is around 3.5 to 4.5 for a good tokenizer