George Pantazopoulos committed on
Commit
b60238d
·
1 Parent(s): 79660f1

feat: update number examples

Browse files
Files changed (2) hide show
  1. playground_examples.py +53 -5
  2. playground_tokenizers.py +1 -0
playground_examples.py CHANGED
@@ -1,12 +1,13 @@
1
- default_user_input = """Replace this text in the input field to see how tokenization works."""
 
 
2
  default_tokenizer_name_1 = "openai/gpt-4o"
3
  default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
4
 
5
 
6
- number_example = """127+677=804
7
- 127 + 677 = 804\n
8
- 1275+6773 = 8041
9
- 1275 + 6773 = 8048"""
10
 
11
  code_example = """for i in range(1, 101):
12
  if i % 3 == 0 and i % 5 == 0:
@@ -23,6 +24,48 @@ spelling_example = """How do you spell "accommodate"?
23
  How many letters are in the word "accommodate"?
24
  How many r's are in the word strawberry?"""
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  examples = {
27
  "number": {
28
  "text": number_example,
@@ -39,4 +82,9 @@ examples = {
39
  "tokenizer_1": default_tokenizer_name_1,
40
  "tokenizer_2": default_tokenizer_name_2,
41
  },
 
 
 
 
 
42
  }
 
1
+ default_user_input = (
2
+ """Replace this text in the input field to see how tokenization works."""
3
+ )
4
  default_tokenizer_name_1 = "openai/gpt-4o"
5
  default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
6
 
7
 
8
+ number_example = """127+677=804\n
9
+ 127 + 677 = 804
10
+ """
 
11
 
12
  code_example = """for i in range(1, 101):
13
  if i % 3 == 0 and i % 5 == 0:
 
24
  How many letters are in the word "accommodate"?
25
  How many r's are in the word strawberry?"""
26
 
27
+
28
+ greek_example = """
29
+ # Both mean 'I am sorry' though the latter one contains accent mark or stress mark
30
+ Συγνωμη
31
+ Συγνώμη
32
+
33
+ # Both refer to "bean"
34
+ Φασόλι
35
+ Φασούλι
36
+
37
+ # Both refer to "Saturday"
38
+ Σάββατο
39
+ Σάβατο
40
+
41
+ # Both translate to 'egg'
42
+ Αυγό
43
+ Αγβό
44
+
45
+ # They both translate to grandfather, though the latter is mostly used in Corfu Island
46
+ Παππούς
47
+ Πάπους
48
+
49
+ # They mean two completely different things!
50
+ Νόνα # refers to grandmother commonly observed in Ionion pelagos
51
+ Νονά # refers to godmother in Christianity
52
+
53
+ # Both refer to something new
54
+ καινούριος
55
+ καινούργιος
56
+
57
+ # Both refer to tomato
58
+ ντοματα
59
+ τοματα
60
+
61
+ τρενο
62
+ τραινο
63
+
64
+ # Singular / Plural versions of something 'innate'
65
+ εγγενής
66
+ εγγενείς
67
+ """
68
+
69
  examples = {
70
  "number": {
71
  "text": number_example,
 
82
  "tokenizer_1": default_tokenizer_name_1,
83
  "tokenizer_2": default_tokenizer_name_2,
84
  },
85
+ "greek": {
86
+ "text": greek_example,
87
+ "tokenizer_1": default_tokenizer_name_1,
88
+ "tokenizer_2": "ilsp/Llama-Krikri-8B-Base",
89
+ },
90
  }
playground_tokenizers.py CHANGED
@@ -96,6 +96,7 @@ tokenizer_configs = [
96
  TokenizerConfig("google/mt5-large", org="Google"),
97
  TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
98
  TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
 
99
  ]
100
 
101
  assert len(set([config.name_display for config in tokenizer_configs])) == len(
 
96
  TokenizerConfig("google/mt5-large", org="Google"),
97
  TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
98
  TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
99
+ TokenizerConfig("ilsp/Llama-Krikri-8B-Base", org="ILSP"),
100
  ]
101
 
102
  assert len(set([config.name_display for config in tokenizer_configs])) == len(