import os

import numpy as np
from transformers import AutoTokenizer
from tqdm import tqdm


def process_data():
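    """Tokenize the raw text corpus with the pretrained tokenizer and write
    the resulting token id stream to train.bin / val.bin."""
    # Raw corpus, tokenizer directory, output directory, and validation split fraction.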
    input_file_path = "data/raw/merged_text/corpus.txt"
    tokenizer_path = "Tokenizer/BPE"
    output_dir = "data/bin"
    val_split_ratio = 0.1

    os.makedirs(output_dir, exist_ok=True)

    print(f"Loading tokenizer from {tokenizer_path}...")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
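
    # The EOS id is appended after every line so document boundaries survive
    # in the flat token stream.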
    eos_id = tokenizer.eos_token_id
    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"EOS ID: {eos_id}")
    print(f"Reading {input_file_path}...")
    with open(input_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    print(f"Total lines: {len(lines):,}")

    print("Tokenizing...")
    all_tokens = []
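
    # Encode line by line, skipping blank lines; every kept line gets an EOS
    # appended before being added to the shared stream.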
    for line in tqdm(lines):
        text = line.strip()
        if not text:
            continue

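        # Note: depending on the tokenizer config, encode() may already add
        # special tokens; the EOS below is appended explicitly in either case.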
        tokens = tokenizer.encode(text)
        tokens.append(eos_id)
        all_tokens.extend(tokens)

    token_count = len(all_tokens)
    print(f"Total tokens: {token_count:,}")
    ids = np.array(all_tokens, dtype=np.uint16)
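
    # Hold out the tail of the stream (val_split_ratio of the tokens) for validation.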
    val_count = int(token_count * val_split_ratio)
    train_ids = ids[:-val_count]
    val_ids = ids[-val_count:]

    print(f"Train tokens: {len(train_ids):,}")
    print(f"Val tokens: {len(val_ids):,}")
    train_ids.tofile(os.path.join(output_dir, "train.bin"))
    val_ids.tofile(os.path.join(output_dir, "val.bin"))

    print(f"✅ Saved binary files to {output_dir}/")


if __name__ == "__main__":
    process_data()