File size: 2,055 Bytes
b60238d
 
 
33f7995
 
 
 
b60238d
 
 
33f7995
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b60238d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33f7995
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b60238d
 
 
 
 
33f7995
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Default text for the user input; it tells the reader to replace it.
default_user_input = (
    "Replace this text in the input field "
    "to see how tokenization works."
)

# Default names for the first and second tokenizer.
default_tokenizer_name_1 = "openai/gpt-4o"
default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"


# Arithmetic sample: the same sum written without and with spaces around the
# operators. The two lines are separated by an empty line and the text ends
# with a newline.
number_example = (
    "127+677=804\n"
    "\n"
    "127 + 677 = 804\n"
)

# Source-code sample: a FizzBuzz loop over 1..100, stored as text (it is not
# executed here). Each literal below is one line of the sample.
code_example = (
    "for i in range(1, 101):\n"
    "    if i % 3 == 0 and i % 5 == 0:\n"
    '        print("FizzBuzz")\n'
    "    elif i % 3 == 0:\n"
    '        print("Fizz")\n'
    "    elif i % 5 == 0:\n"
    '        print("Buzz")\n'
    "    else:\n"
    "        print(i)\n"
)

# Spelling sample: three questions about spelling and letter counts, one per
# line, with no trailing newline.
spelling_example = "\n".join(
    (
        'How do you spell "accommodate"?',
        'How many letters are in the word "accommodate"?',
        "How many r's are in the word strawberry?",
    )
)


# Greek-language sample: groups of closely related Greek words (spelling
# variants, accent differences, singular/plural), each group optionally
# preceded by an English note.
#
# The lines starting with '#' INSIDE the literal are part of the string, not
# Python comments. The literal starts with a newline, and a few of its lines
# carry trailing spaces; all of that is part of the value, so edit with care.
#
# NOTE(review): "Αγβό" in the 'egg' pair looks like it may be a transposition
# of "Αβγό" — confirm with a Greek speaker before changing the text.
# NOTE(review): the "τρενο" / "τραινο" pair has no explanatory note line,
# unlike the other groups — confirm whether one was intended.
greek_example = """
# Both mean 'I am sorry' though the latter one contains accent mark or stress mark
Συγνωμη
Συγνώμη

# Both refer to "bean"
Φασόλι
Φασούλι

# Both refer to "Saturday"
Σάββατο
Σάβατο

# Both translate to 'egg'
Αυγό
Αγβό

# They both translate to grandfather, though the latter is mostly used in Corfu Island
Παππούς
Πάπους 

# They mean two completely different things! 
Νόνα # refers to grandmother commonly observed in Ionion pelagos
Νονά # refers to godmother in Christianity

# Both refer to something new
καινούριος
καινούργιος

#  Both refer to tomato
ντοματα
τοματα

τρενο
τραινο

# Singular / Plural versions of something 'innate'  
εγγενής
εγγενείς
"""

# Registry of examples keyed by short name. Every entry holds the sample
# text plus the two tokenizer names to use with it. All entries share the
# default first tokenizer; only "greek" overrides the second one.
examples = {
    label: {
        "text": sample_text,
        "tokenizer_1": default_tokenizer_name_1,
        "tokenizer_2": second_tokenizer,
    }
    for label, sample_text, second_tokenizer in (
        ("number", number_example, default_tokenizer_name_2),
        ("code", code_example, default_tokenizer_name_2),
        ("spelling", spelling_example, default_tokenizer_name_2),
        ("greek", greek_example, "ilsp/Llama-Krikri-8B-Base"),
    )
}