# Mini-LLM / Tokenizer / convert_to_hf.py
# Uploaded by Ashx098 using huggingface_hub (commit a433a25, verified; 553 bytes).
from pathlib import Path

from transformers import LlamaTokenizerFast
# Convert a raw SentencePiece model into Hugging Face tokenizer files.
#
# Paths are resolved relative to this script instead of one developer's home
# directory or the current working directory, so the conversion works from any
# checkout location and any launch directory.
# NOTE(review): assumes this script lives in the Tokenizer/ directory next to
# BPE/spm.model, as the original absolute path suggests -- confirm.
SCRIPT_DIR = Path(__file__).resolve().parent
SPM_MODEL_PATH = SCRIPT_DIR / "BPE" / "spm.model"
OUTPUT_DIR = SCRIPT_DIR

# Fail early with an explicit message rather than an opaque loader error.
if not SPM_MODEL_PATH.is_file():
    raise FileNotFoundError(f"SentencePiece model not found: {SPM_MODEL_PATH}")

# Load the raw spm model
tokenizer = LlamaTokenizerFast(vocab_file=str(SPM_MODEL_PATH))

# Register the special tokens on the Hugging Face side. add_special_tokens
# returns how many of them were not yet in the vocabulary and were appended.
num_added = tokenizer.add_special_tokens({
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "additional_special_tokens": ["<user>", "<assistant>", "<system>"],
})
# Surface the count: newly appended tokens enlarge the vocabulary, which
# matters for anything sized to the original SentencePiece vocab.
print(f"Special tokens newly added to the vocabulary: {num_added}")

# Save the json version
tokenizer.save_pretrained(str(OUTPUT_DIR))
print("Converted to tokenizer.json successfully!")