from transformers import LlamaTokenizerFast
# Load the raw SentencePiece model; LlamaTokenizerFast builds the fast (Rust-backed) tokenizer from it
tokenizer = LlamaTokenizerFast(vocab_file="/home/aviinashh/projects/Mini-LLM/Tokenizer/BPE/spm.model")
# Register the special tokens so they are written into the HF tokenizer config.
# The tokens below are the standard SentencePiece/Llama defaults; swap in the
# exact strings your spm model was trained with if they differ.
tokenizer.add_special_tokens({
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    # Placeholder examples -- replace with the custom tokens your project defines
    "additional_special_tokens": ["<|user|>", "<|assistant|>", "<|system|>"],
})
# Save the tokenizer in HF format (tokenizer.json plus its config files)
tokenizer.save_pretrained("Tokenizer/")
print("Converted to tokenizer.json successfully!")