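"""Tokenize a raw text corpus into flat binary token files for LM pre-training.

Reads the corpus line by line, encodes each line with the BPE tokenizer
(appending an EOS token after every line), and writes the resulting uint16
token stream to data/bin/train.bin and data/bin/val.bin for memory-mapped
loading during training.
"""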
import os
import numpy as np
from transformers import AutoTokenizer
from tqdm import tqdm

def process_data():
    # 1. Config
    input_file_path = "data/raw/merged_text/corpus.txt"  # PATH TO YOUR DATA
    tokenizer_path = "Tokenizer/BPE"                     # PATH TO YOUR NEW TOKENIZER
    output_dir = "data/bin"
    val_split_ratio = 0.1  # 10% for validation

    os.makedirs(output_dir, exist_ok=True)

    # 2. Load Tokenizer
    print(f"Loading tokenizer from {tokenizer_path}...")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    
    # Ensure an eos_token is defined (often ID 2 for this kind of BPE vocab);
    # eos_token_id is None if the tokenizer has no EOS token configured.
    eos_id = tokenizer.eos_token_id
    if eos_id is None:
        raise ValueError(f"Tokenizer at {tokenizer_path} has no eos_token defined.")
    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"EOS ID: {eos_id}")

    # 3. Read Data
    print(f"Reading {input_file_path}...")
    with open(input_file_path, 'r', encoding='utf-8') as f:
        # Read all lines
        lines = f.readlines()
        
    print(f"Total lines: {len(lines):,}")

    # 4. Tokenize
    # A plain sequential loop is fine at this (~80M-token) scale.
    # At 100B-token scale you would parallelize the encoding with
    # multiprocessing (see the tokenize_parallel() sketch below).
    print("Tokenizing...")
    all_tokens = []
    
    # tqdm shows a progress bar over the corpus lines
    for line in tqdm(lines):
        text = line.strip()
        if not text:
            continue

        # Encode the line and append an EOS token so the model can learn where
        # one line/document ends and the next begins.
        # Note: depending on the tokenizer config, encode() may itself insert
        # special tokens; pass add_special_tokens=False if only the manual EOS
        # below is wanted.
        tokens = tokenizer.encode(text)
        tokens.append(eos_id)
        all_tokens.extend(tokens)

    token_count = len(all_tokens)
    print(f"Total tokens: {token_count:,}")

    # 5. Convert to NumPy. uint16 needs only 2 bytes per token, and a
    # 32,000-entry vocab fits comfortably below the uint16 maximum of 65,535.
    ids = np.array(all_tokens, dtype=np.uint16)

    # 6. Split Train/Val: the last val_count tokens become the validation set
    val_count = int(token_count * val_split_ratio)
    split_idx = token_count - val_count
    train_ids = ids[:split_idx]
    val_ids = ids[split_idx:]

    print(f"Train tokens: {len(train_ids):,}")
    print(f"Val tokens:   {len(val_ids):,}")

    # 7. Save to disk as raw binary so training can np.memmap the files
    # (see the read_bin_sample() sketch at the end of this file)
    train_ids.tofile(os.path.join(output_dir, "train.bin"))
    val_ids.tofile(os.path.join(output_dir, "val.bin"))
    
    print(f"✅ Saved binary files to {output_dir}/")
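

# ---------------------------------------------------------------------------
# Illustrative sketch only (not called by process_data above): one way to
# realize the "100B scale" comment in step 4 with multiprocessing. Every
# worker loads its own tokenizer copy via the pool initializer. The names
# tokenize_parallel, _init_worker and _encode_line are hypothetical.
# ---------------------------------------------------------------------------
from multiprocessing import Pool

_worker_tokenizer = None  # set inside each worker by _init_worker()


def _init_worker(tokenizer_path):
    # Runs once per worker process; gives each worker its own tokenizer.
    global _worker_tokenizer
    _worker_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)


def _encode_line(line):
    # Mirror the sequential loop: skip blank lines, append EOS after each line.
    text = line.strip()
    if not text:
        return []
    tokens = _worker_tokenizer.encode(text)
    tokens.append(_worker_tokenizer.eos_token_id)
    return tokens


def tokenize_parallel(lines, tokenizer_path, num_workers=8, chunksize=1_000):
    # imap preserves input order, so the token stream matches the sequential version.
    all_tokens = []
    with Pool(num_workers, initializer=_init_worker, initargs=(tokenizer_path,)) as pool:
        for tokens in pool.imap(_encode_line, lines, chunksize=chunksize):
            all_tokens.extend(tokens)
    return all_tokens
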

if __name__ == "__main__":
    process_data()
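

# ---------------------------------------------------------------------------
# Illustrative sketch only: how a training script could read train.bin /
# val.bin back with np.memmap, i.e. without loading the whole file into RAM.
# The name read_bin_sample and the block_size/batch_size parameters are
# hypothetical, not part of the pipeline above.
# ---------------------------------------------------------------------------
def read_bin_sample(bin_path, block_size=256, batch_size=4, seed=0):
    # Map the file as read-only uint16; data is only paged in when indexed.
    data = np.memmap(bin_path, dtype=np.uint16, mode="r")
    rng = np.random.default_rng(seed)
    # Cut random contiguous blocks of block_size tokens out of the stream.
    starts = rng.integers(0, len(data) - block_size, size=batch_size)
    batch = np.stack([data[s : s + block_size].astype(np.int64) for s in starts])
    return batch  # shape: (batch_size, block_size)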