Update README.md
Browse files
README.md
CHANGED
@@ -89,10 +89,10 @@ Optimized tokenizer for trilingual texts with extended support for Russian vocab
|
|
89 |
| Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it | 5984540 | 1.3 | 129024 | 51435 | 39.9 | 4.6 | 1 | 14.5142 | 3.0903 | 16384 |
|
90 |
| yandex/YandexGPT-5-Lite-8B-instruct | 5984540 | 1.3 | 129024 | 51435 | 39.9 | 4.6 | 1 | 15.081 | 4.5032 | 1E+030 |
|
91 |
| IlyaGusev/saiga_yandexgpt_8b | 5984540 | 1.3 | 129024 | 51435 | 39.9 | 4.6 | 1 | 15.7957 | 3.6403 | 32768 |
|
92 |
-
| loim/
|
93 |
| TinyLlama/TinyLlama-1.1B-Chat-v1.0 | 6655231 | 1.44 | 32000 | 24919 | 77.9 | 4.2 | 1 | 43.1161 | 5.5738 | 2048 |
|
94 |
| ai-forever/ruGPT-3.5-13B | 7154363 | 1.55 | 50257 | 12582 | 25.0 | 3.9 | 0 | 15.711 | 11.2961 | 2048 |
|
95 |
-
| loim/
|
96 |
| ai-forever/rugpt3small_based_on_gpt2 | 7749641 | 1.68 | 50257 | 10938 | 21.8 | 3.6 | 0 | 16.4294 | 8.9582 | 2048 |
|
97 |
|
98 |
### Russian text (16315296 chars, 2185925 words)
|
@@ -104,20 +104,20 @@ Optimized tokenizer for trilingual texts with extended support for Russian vocab
|
|
104 |
| ai-forever/ruGPT-3.5-13B | 3693945 | 1.69 | 50257 | 43208 | 86.0 | 4.4 | 0 | 16.1615 | 3.9659 | 2048 |
|
105 |
| RefalMachine/RuadaptQwen3-32B-Instruct | 3732533 | 1.71 | 146213 | 52564 | 36.0 | 4.4 | 1 | 16.5792 | 2.4271 | 131072 |
|
106 |
| ai-forever/rugpt3small_based_on_gpt2 | 3801887 | 1.74 | 50257 | 42820 | 85.2 | 4.3 | 0 | 17.1418 | 2.9581 | 2048 |
|
107 |
-
| loim/
|
108 |
| deepseek-ai/DeepSeek-V3 | 4806676 | 2.2 | 128000 | 21621 | 16.9 | 3.4 | 1 | 15.8833 | 2.2505 | 131072 |
|
109 |
| IlyaGusev/saiga_nemo_12b | 4926095 | 2.25 | 131072 | 21901 | 16.7 | 3.3 | 1 | 15.2355 | 3.6558 | 1024000 |
|
110 |
| Gensyn/Qwen2.5-1.5B-Instruct | 5411283 | 2.48 | 151643 | 20458 | 13.5 | 3.0 | 1 | 14.6061 | 1.9548 | 131072 |
|
111 |
| deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B | 5411284 | 2.48 | 151643 | 20459 | 13.5 | 3.0 | 1 | 16.4851 | 1.5277 | 16384 |
|
112 |
| TinyLlama/TinyLlama-1.1B-Chat-v1.0 | 5986567 | 2.74 | 32000 | 13454 | 42.0 | 2.7 | 1 | 20.6121 | 1.9489 | 2048 |
|
113 |
-
| loim/
|
114 |
| openai-community/gpt2 | 16931837 | 7.75 | 50257 | 13818 | 27.5 | 1.0 | 1 | 19.4 | 6.16 | 1024 |
|
115 |
| facebook/opt-125m | 16931838 | 7.75 | 50265 | 13819 | 27.5 | 1.0 | 1 | 22.1165 | 4.2726 | 1E+030 |
|
116 |
|
117 |
### Toki pona text (3663780 chars, 831463 words)
|
118 |
| Tokenizer | Tokens | Compression | Vocab Size | Vocab Used | Vocab Usage % | Avg Token Length | Perfect Detokenization | Tokenization Time (s) | Detokenization Time (s) | Max Length |
|
119 |
|---|---|---|---|---|---|---|---|---|---|---|
|
120 |
-
| loim/
|
121 |
| IlyaGusev/saiga_nemo_12b | 1332599 | 1.6 | 131072 | 8428 | 6.4 | 2.7 | 1 | 2.7613 | 0.7956 | 1024000 |
|
122 |
| deepseek-ai/DeepSeek-V3 | 1343359 | 1.62 | 128000 | 8870 | 6.9 | 2.7 | 1 | 2.6998 | 0.4471 | 131072 |
|
123 |
| RefalMachine/RuadaptQwen3-32B-Instruct | 1396348 | 1.68 | 146213 | 7546 | 5.2 | 2.6 | 1 | 2.3745 | 2.2573 | 131072 |
|
@@ -131,6 +131,6 @@ Optimized tokenizer for trilingual texts with extended support for Russian vocab
|
|
131 |
| facebook/opt-125m | 1550847 | 1.87 | 50265 | 6681 | 13.3 | 2.4 | 1 | 2.4144 | 0.6391 | 1E+030 |
|
132 |
| ai-forever/ruGPT-3.5-13B | 1828262 | 2.2 | 50257 | 3881 | 7.7 | 2.0 | 0 | 2.1597 | 0.7194 | 2048 |
|
133 |
| ai-forever/rugpt3small_based_on_gpt2 | 1925501 | 2.32 | 50257 | 3697 | 7.4 | 1.9 | 0 | 1.9954 | 0.8262 | 2048 |
|
134 |
-
| loim/
|
135 |
|
136 |
</details>
|
|
|
89 |
| Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it | 5984540 | 1.3 | 129024 | 51435 | 39.9 | 4.6 | 1 | 14.5142 | 3.0903 | 16384 |
|
90 |
| yandex/YandexGPT-5-Lite-8B-instruct | 5984540 | 1.3 | 129024 | 51435 | 39.9 | 4.6 | 1 | 15.081 | 4.5032 | 1E+030 |
|
91 |
| IlyaGusev/saiga_yandexgpt_8b | 5984540 | 1.3 | 129024 | 51435 | 39.9 | 4.6 | 1 | 15.7957 | 3.6403 | 32768 |
|
92 |
+
| loim/whiff-tokenizer-12k | 6271746 | 1.36 | 12288 | 9611 | 78.2 | 4.4 | 1 | 41.6606 | 1.5217 | 65536 |
|
93 |
| TinyLlama/TinyLlama-1.1B-Chat-v1.0 | 6655231 | 1.44 | 32000 | 24919 | 77.9 | 4.2 | 1 | 43.1161 | 5.5738 | 2048 |
|
94 |
| ai-forever/ruGPT-3.5-13B | 7154363 | 1.55 | 50257 | 12582 | 25.0 | 3.9 | 0 | 15.711 | 11.2961 | 2048 |
|
95 |
+
| loim/whiff-tokenizer-8k | 7369398 | 1.6 | 8192 | 7456 | 91.0 | 3.8 | 1 | 32.1512 | 1.6195 | 32768 |
|
96 |
| ai-forever/rugpt3small_based_on_gpt2 | 7749641 | 1.68 | 50257 | 10938 | 21.8 | 3.6 | 0 | 16.4294 | 8.9582 | 2048 |
|
97 |
|
98 |
### Russian text (16315296 chars, 2185925 words)
|
|
|
104 |
| ai-forever/ruGPT-3.5-13B | 3693945 | 1.69 | 50257 | 43208 | 86.0 | 4.4 | 0 | 16.1615 | 3.9659 | 2048 |
|
105 |
| RefalMachine/RuadaptQwen3-32B-Instruct | 3732533 | 1.71 | 146213 | 52564 | 36.0 | 4.4 | 1 | 16.5792 | 2.4271 | 131072 |
|
106 |
| ai-forever/rugpt3small_based_on_gpt2 | 3801887 | 1.74 | 50257 | 42820 | 85.2 | 4.3 | 0 | 17.1418 | 2.9581 | 2048 |
|
107 |
+
| loim/whiff-tokenizer-12k | 4070967 | 1.86 | 12288 | 9306 | 75.7 | 4.0 | 1 | 35.0603 | 1.3202 | 65536 |
|
108 |
| deepseek-ai/DeepSeek-V3 | 4806676 | 2.2 | 128000 | 21621 | 16.9 | 3.4 | 1 | 15.8833 | 2.2505 | 131072 |
|
109 |
| IlyaGusev/saiga_nemo_12b | 4926095 | 2.25 | 131072 | 21901 | 16.7 | 3.3 | 1 | 15.2355 | 3.6558 | 1024000 |
|
110 |
| Gensyn/Qwen2.5-1.5B-Instruct | 5411283 | 2.48 | 151643 | 20458 | 13.5 | 3.0 | 1 | 14.6061 | 1.9548 | 131072 |
|
111 |
| deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B | 5411284 | 2.48 | 151643 | 20459 | 13.5 | 3.0 | 1 | 16.4851 | 1.5277 | 16384 |
|
112 |
| TinyLlama/TinyLlama-1.1B-Chat-v1.0 | 5986567 | 2.74 | 32000 | 13454 | 42.0 | 2.7 | 1 | 20.6121 | 1.9489 | 2048 |
|
113 |
+
| loim/whiff-tokenizer-8k | 6090683 | 2.79 | 8192 | 5749 | 70.2 | 2.7 | 1 | 24.6047 | 1.4503 | 32768 |
|
114 |
| openai-community/gpt2 | 16931837 | 7.75 | 50257 | 13818 | 27.5 | 1.0 | 1 | 19.4 | 6.16 | 1024 |
|
115 |
| facebook/opt-125m | 16931838 | 7.75 | 50265 | 13819 | 27.5 | 1.0 | 1 | 22.1165 | 4.2726 | 1E+030 |
|
116 |
|
117 |
### Toki pona text (3663780 chars, 831463 words)
|
118 |
| Tokenizer | Tokens | Compression | Vocab Size | Vocab Used | Vocab Usage % | Avg Token Length | Perfect Detokenization | Tokenization Time (s) | Detokenization Time (s) | Max Length |
|
119 |
|---|---|---|---|---|---|---|---|---|---|---|
|
120 |
+
| loim/whiff-tokenizer-12k | 1144322 | 1.38 | 12288 | 2927 | 23.8 | 3.2 | 1 | 4.145 | 0.2371 | 65536 |
|
121 |
| IlyaGusev/saiga_nemo_12b | 1332599 | 1.6 | 131072 | 8428 | 6.4 | 2.7 | 1 | 2.7613 | 0.7956 | 1024000 |
|
122 |
| deepseek-ai/DeepSeek-V3 | 1343359 | 1.62 | 128000 | 8870 | 6.9 | 2.7 | 1 | 2.6998 | 0.4471 | 131072 |
|
123 |
| RefalMachine/RuadaptQwen3-32B-Instruct | 1396348 | 1.68 | 146213 | 7546 | 5.2 | 2.6 | 1 | 2.3745 | 2.2573 | 131072 |
|
|
|
131 |
| facebook/opt-125m | 1550847 | 1.87 | 50265 | 6681 | 13.3 | 2.4 | 1 | 2.4144 | 0.6391 | 1E+030 |
|
132 |
| ai-forever/ruGPT-3.5-13B | 1828262 | 2.2 | 50257 | 3881 | 7.7 | 2.0 | 0 | 2.1597 | 0.7194 | 2048 |
|
133 |
| ai-forever/rugpt3small_based_on_gpt2 | 1925501 | 2.32 | 50257 | 3697 | 7.4 | 1.9 | 0 | 1.9954 | 0.8262 | 2048 |
|
134 |
+
| loim/whiff-tokenizer-8k | 2123707 | 2.55 | 8192 | 2709 | 33.1 | 1.7 | 1 | 2.4541 | 0.3799 | 32768 |
|
135 |
|
136 |
</details>
|