almaghrabima commited on
Commit
6479992
·
verified ·
1 Parent(s): e4a23ce

Honest OOD FineWeb benchmark: lead AR +13.5% vs GPT-4o, lose EN by 6.2%

Browse files
Files changed (1) hide show
  1. fair_benchmark_results.json +170 -0
fair_benchmark_results.json ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "corpus": {
3
+ "ar_source": "HuggingFaceFW/fineweb-2/arb_Arab",
4
+ "en_source": "HuggingFaceFW/fineweb-edu/sample-10BT",
5
+ "num_samples_per_lang": 300,
6
+ "max_chars": 2000,
7
+ "ar_chars": 400273,
8
+ "en_chars": 507749,
9
+ "ar_words": 68345,
10
+ "en_words": 82573
11
+ },
12
+ "results": [
13
+ {
14
+ "name": "SARFTokenizer v0.2",
15
+ "vocab": 65000,
16
+ "ar_cpt": 3.597183528946564,
17
+ "en_cpt": 4.514207222745782,
18
+ "parity": 0.7968583078821457,
19
+ "ar_tpw": 1.6281220279464481,
20
+ "en_tpw": 1.3621643878749712,
21
+ "ar_tokens": 111274,
22
+ "en_tokens": 112478,
23
+ "encode_time": 0.83054518699646
24
+ },
25
+ {
26
+ "name": "tiktoken/o200k_base",
27
+ "vocab": 200019,
28
+ "ar_cpt": 3.1684714636270086,
29
+ "en_cpt": 4.796329183276341,
30
+ "parity": 0.660603420356283,
31
+ "ar_tpw": 1.8484161240763772,
32
+ "en_tpw": 1.2820413452339143,
33
+ "ar_tokens": 126330,
34
+ "en_tokens": 105862,
35
+ "encode_time": 0.25562620162963867
36
+ },
37
+ {
38
+ "name": "tiktoken/cl100k_base",
39
+ "vocab": 100277,
40
+ "ar_cpt": 1.4619227973805602,
41
+ "en_cpt": 4.741108361734908,
42
+ "parity": 0.30835042902195986,
43
+ "ar_tpw": 4.006130660618918,
44
+ "en_tpw": 1.2969735870078598,
45
+ "ar_tokens": 273799,
46
+ "en_tokens": 107095,
47
+ "encode_time": 0.17181658744812012
48
+ },
49
+ {
50
+ "name": "Qwen/Qwen2.5-0.5B",
51
+ "vocab": 151665,
52
+ "ar_cpt": 2.6675619118705516,
53
+ "en_cpt": 4.649290358025822,
54
+ "parity": 0.5737567900584402,
55
+ "ar_tpw": 2.195508083985661,
56
+ "en_tpw": 1.3225872864011239,
57
+ "ar_tokens": 150052,
58
+ "en_tokens": 109210,
59
+ "encode_time": 0.7926328182220459
60
+ },
61
+ {
62
+ "name": "Qwen/Qwen3.6-35B-A3B",
63
+ "vocab": 248077,
64
+ "ar_cpt": 3.229727112818113,
65
+ "en_cpt": 4.595803803368905,
66
+ "parity": 0.7027556551588638,
67
+ "ar_tpw": 1.8133586948569755,
68
+ "en_tpw": 1.3379797270294165,
69
+ "ar_tokens": 123934,
70
+ "en_tokens": 110481,
71
+ "encode_time": 0.7811288833618164
72
+ },
73
+ {
74
+ "name": "google/gemma-2-2b",
75
+ "vocab": 256000,
76
+ "ar_cpt": 2.8638181571020755,
77
+ "en_cpt": 4.679024291348741,
78
+ "parity": 0.6120545606905948,
79
+ "ar_tpw": 2.0450508449776867,
80
+ "en_tpw": 1.3141826020612064,
81
+ "ar_tokens": 139769,
82
+ "en_tokens": 108516,
83
+ "encode_time": 0.5027248859405518
84
+ },
85
+ {
86
+ "name": "google/gemma-3-1b-pt",
87
+ "vocab": 262145,
88
+ "ar_cpt": 2.9128346565563215,
89
+ "en_cpt": 4.660299948601219,
90
+ "parity": 0.6250315835208426,
91
+ "ar_tpw": 2.010637208281513,
92
+ "en_tpw": 1.3194627783900306,
93
+ "ar_tokens": 137417,
94
+ "en_tokens": 108952,
95
+ "encode_time": 0.49225592613220215
96
+ },
97
+ {
98
+ "name": "google/gemma-4-31B-it",
99
+ "vocab": 262144,
100
+ "ar_cpt": 2.9128346565563215,
101
+ "en_cpt": 4.660299948601219,
102
+ "parity": 0.6250315835208426,
103
+ "ar_tpw": 2.010637208281513,
104
+ "en_tpw": 1.3194627783900306,
105
+ "ar_tokens": 137417,
106
+ "en_tokens": 108952,
107
+ "encode_time": 0.47115373611450195
108
+ },
109
+ {
110
+ "name": "tiiuae/falcon-7b",
111
+ "vocab": 65024,
112
+ "ar_cpt": 1.0156944641805892,
113
+ "en_cpt": 4.558954513620773,
114
+ "parity": 0.22279109413046394,
115
+ "ar_tpw": 5.766156997585778,
116
+ "en_tpw": 1.3487944001065724,
117
+ "ar_tokens": 394088,
118
+ "en_tokens": 111374,
119
+ "encode_time": 1.0502738952636719
120
+ },
121
+ {
122
+ "name": "ALLaM-AI/ALLaM-7B-Instruct-preview",
123
+ "vocab": 64000,
124
+ "ar_cpt": 2.928541117939713,
125
+ "en_cpt": 3.4529473369239976,
126
+ "parity": 0.8481279417798931,
127
+ "ar_tpw": 1.9998536835174483,
128
+ "en_tpw": 1.780824240369128,
129
+ "ar_tokens": 136680,
130
+ "en_tokens": 147048,
131
+ "encode_time": 0.3760392665863037
132
+ },
133
+ {
134
+ "name": "QCRI/Fanar-1-9B-Instruct",
135
+ "vocab": 128256,
136
+ "ar_cpt": 2.861709277053306,
137
+ "en_cpt": 4.564241089487168,
138
+ "parity": 0.6269846883515182,
139
+ "ar_tpw": 2.04655790474797,
140
+ "en_tpw": 1.3472321461010257,
141
+ "ar_tokens": 139872,
142
+ "en_tokens": 111245,
143
+ "encode_time": 0.46768975257873535
144
+ },
145
+ {
146
+ "name": "hammh0a/Hala-350M",
147
+ "vocab": 64400,
148
+ "ar_cpt": 2.2519508284339924,
149
+ "en_cpt": 4.648353962208876,
150
+ "parity": 0.48446199380304417,
151
+ "ar_tpw": 2.6007023191162486,
152
+ "en_tpw": 1.3228537173167985,
153
+ "ar_tokens": 177745,
154
+ "en_tokens": 109232,
155
+ "encode_time": 0.6467485427856445
156
+ },
157
+ {
158
+ "name": "moonshotai/Kimi-K2.6",
159
+ "vocab": 163840,
160
+ "ar_cpt": 2.114814208198826,
161
+ "en_cpt": 4.74310135450724,
162
+ "parity": 0.4458716038587655,
163
+ "ar_tpw": 2.7693466969054064,
164
+ "en_tpw": 1.2964286146803434,
165
+ "ar_tokens": 189271,
166
+ "en_tokens": 107050,
167
+ "encode_time": 3.198082685470581
168
+ }
169
+ ]
170
+ }