Spaces:
Running
Running
George Pantazopoulos
committed on
Commit
·
b60238d
1
Parent(s):
79660f1
feat: update number examples
Browse files
- playground_examples.py +53 -5
- playground_tokenizers.py +1 -0
playground_examples.py
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
-
default_user_input =
|
|
|
|
|
2 |
default_tokenizer_name_1 = "openai/gpt-4o"
|
3 |
default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
|
4 |
|
5 |
|
6 |
-
number_example = """127+677=804
|
7 |
-
127 + 677 = 804
|
8 |
-
|
9 |
-
1275 + 6773 = 8048"""
|
10 |
|
11 |
code_example = """for i in range(1, 101):
|
12 |
if i % 3 == 0 and i % 5 == 0:
|
@@ -23,6 +24,48 @@ spelling_example = """How do you spell "accommodate"?
|
|
23 |
How many letters are in the word "accommodate"?
|
24 |
How many r's are in the word strawberry?"""
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
examples = {
|
27 |
"number": {
|
28 |
"text": number_example,
|
@@ -39,4 +82,9 @@ examples = {
|
|
39 |
"tokenizer_1": default_tokenizer_name_1,
|
40 |
"tokenizer_2": default_tokenizer_name_2,
|
41 |
},
|
|
|
|
|
|
|
|
|
|
|
42 |
}
|
|
|
1 |
+
default_user_input = (
|
2 |
+
"""Replace this text in the input field to see how tokenization works."""
|
3 |
+
)
|
4 |
default_tokenizer_name_1 = "openai/gpt-4o"
|
5 |
default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
|
6 |
|
7 |
|
8 |
+
number_example = """127+677=804\n
|
9 |
+
127 + 677 = 804
|
10 |
+
"""
|
|
|
11 |
|
12 |
code_example = """for i in range(1, 101):
|
13 |
if i % 3 == 0 and i % 5 == 0:
|
|
|
24 |
How many letters are in the word "accommodate"?
|
25 |
How many r's are in the word strawberry?"""
|
26 |
|
27 |
+
|
28 |
+
greek_example = """
|
29 |
+
# Both mean 'I am sorry' though the latter one contains accent mark or stress mark
|
30 |
+
Συγνωμη
|
31 |
+
Συγνώμη
|
32 |
+
|
33 |
+
# Both refer to "bean"
|
34 |
+
Φασόλι
|
35 |
+
Φασούλι
|
36 |
+
|
37 |
+
# Both refer to "Saturday"
|
38 |
+
Σάββατο
|
39 |
+
Σάβατο
|
40 |
+
|
41 |
+
# Both translate to 'egg'
|
42 |
+
Αυγό
|
43 |
+
Αγβό
|
44 |
+
|
45 |
+
# They both translate to grandfather, though the latter is mostly used in Corfu Island
|
46 |
+
Παππούς
|
47 |
+
Πάπους
|
48 |
+
|
49 |
+
# They mean two completely different things!
|
50 |
+
Νόνα # refers to grandmother commonly observed in Ionion pelagos
|
51 |
+
Νονά # refers to godmother in Christianity
|
52 |
+
|
53 |
+
# Both refer to something new
|
54 |
+
καινούριος
|
55 |
+
καινούργιος
|
56 |
+
|
57 |
+
# Both refer to tomato
|
58 |
+
ντοματα
|
59 |
+
τοματα
|
60 |
+
|
61 |
+
τρενο
|
62 |
+
τραινο
|
63 |
+
|
64 |
+
# Singular / Plural versions of something 'innate'
|
65 |
+
εγγενής
|
66 |
+
εγγενείς
|
67 |
+
"""
|
68 |
+
|
69 |
examples = {
|
70 |
"number": {
|
71 |
"text": number_example,
|
|
|
82 |
"tokenizer_1": default_tokenizer_name_1,
|
83 |
"tokenizer_2": default_tokenizer_name_2,
|
84 |
},
|
85 |
+
"greek": {
|
86 |
+
"text": greek_example,
|
87 |
+
"tokenizer_1": default_tokenizer_name_1,
|
88 |
+
"tokenizer_2": "ilsp/Llama-Krikri-8B-Base",
|
89 |
+
},
|
90 |
}
|
playground_tokenizers.py
CHANGED
@@ -96,6 +96,7 @@ tokenizer_configs = [
|
|
96 |
TokenizerConfig("google/mt5-large", org="Google"),
|
97 |
TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
|
98 |
TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
|
|
|
99 |
]
|
100 |
|
101 |
assert len(set([config.name_display for config in tokenizer_configs])) == len(
|
|
|
96 |
TokenizerConfig("google/mt5-large", org="Google"),
|
97 |
TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
|
98 |
TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
|
99 |
+
TokenizerConfig("ilsp/Llama-Krikri-8B-Base", org="ILSP"),
|
100 |
]
|
101 |
|
102 |
assert len(set([config.name_display for config in tokenizer_configs])) == len(
|