Rasmus Lellep committed on
Commit
2370590
·
1 Parent(s): a4a8a33
Files changed (5) hide show
  1. .DS_Store +0 -0
  2. README.md +7 -7
  3. app.py +330 -0
  4. kuidastaltsutadalaamat +1 -0
  5. requirements.txt +8 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: Smugri4 1808 Ep3
3
  emoji: 🐢
4
  colorFrom: indigo
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 5.42.0
 
8
  app_file: app.py
9
  pinned: false
10
- short_description: Smugri translation
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: smugri4-1808-ep3
3
  emoji: 🐢
4
  colorFrom: indigo
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.41.0
8
+ python_version: 3.11
9
  app_file: app.py
10
  pinned: false
11
+ license: other
12
+ short_description: Smugri language detection and translation
13
+ ---
 
app.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from kuidastaltsutadalaamat.trainllm import load_model, load_tokenizer
4
+ from kuidastaltsutadalaamat.inference import llm_generate
5
+ from kuidastaltsutadalaamat.data import LazyTokenizingInferenceDataset
6
+ from kuidastaltsutadalaamat.promptops import *
7
+
8
# Checkpoint identifier shared by the model and tokenizer loaders below.
MODEL_ID = "mphi/smugri4-1808-ep3-tmptest"

# No accelerator object is used; the loaders receive None.
accel = None

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# attention="eager" is requested unconditionally (original note: eager for cpu).
model = load_model(MODEL_ID, device, accelerator=accel, attention="eager")
model.eval()  # switch the module to evaluation mode

tokenizer = load_tokenizer(MODEL_ID, accelerator=accel)
13
+ languages = ["English",
14
+ "English, bible",
15
+ "English, dictionary",
16
+ "English, speech",
17
+ "Erzya",
18
+ "Erzya, bible",
19
+ "Erzya, dictionary",
20
+ "Estonian",
21
+ "Estonian, Amb, dictionary",
22
+ "Estonian, Ann, dictionary",
23
+ "Estonian, Aud, dictionary",
24
+ "Estonian, Hag, dictionary",
25
+ "Estonian, Han, dictionary",
26
+ "Estonian, Hii, emm, dictionary",
27
+ "Estonian, Hii, käi, dictionary",
28
+ "Estonian, Hii, phl, dictionary",
29
+ "Estonian, Hii, rei, dictionary",
30
+ "Estonian, Hiiu, dictionary",
31
+ "Estonian, Hjn, dictionary",
32
+ "Estonian, Hljr, dictionary",
33
+ "Estonian, Hmd, dictionary",
34
+ "Estonian, Hää, dictionary",
35
+ "Estonian, Iisi, dictionary",
36
+ "Estonian, Iisk, dictionary",
37
+ "Estonian, Iisr, dictionary",
38
+ "Estonian, Jjn, dictionary",
39
+ "Estonian, Jmd, dictionary",
40
+ "Estonian, Juu, dictionary",
41
+ "Estonian, Jõer, dictionary",
42
+ "Estonian, Jõh, dictionary",
43
+ "Estonian, Jür, dictionary",
44
+ "Estonian, Kad, dictionary",
45
+ "Estonian, Kihnu, dictionary",
46
+ "Estonian, Kjn, dictionary",
47
+ "Estonian, Kodi, dictionary",
48
+ "Estonian, Kos, dictionary",
49
+ "Estonian, Kse, dictionary",
50
+ "Estonian, Kuuk, dictionary",
51
+ "Estonian, Kuur, dictionary",
52
+ "Estonian, Lng, dictionary",
53
+ "Estonian, Lüg, dictionary",
54
+ "Estonian, Mar, dictionary",
55
+ "Estonian, Mih, dictionary",
56
+ "Estonian, Muhu, dictionary",
57
+ "Estonian, Mul, hel, dictionary",
58
+ "Estonian, Mul, hls, dictionary",
59
+ "Estonian, Mul, krk, dictionary",
60
+ "Estonian, Mul, trv, dictionary",
61
+ "Estonian, Mulgi",
62
+ "Estonian, Mulgi, dictionary",
63
+ "Estonian, Mär, dictionary",
64
+ "Estonian, Pjg, dictionary",
65
+ "Estonian, Plt, dictionary",
66
+ "Estonian, Ranna, dictionary",
67
+ "Estonian, Rap, dictionary",
68
+ "Estonian, Rid, dictionary",
69
+ "Estonian, Saa, dictionary",
70
+ "Estonian, Saa, jäm, dictionary",
71
+ "Estonian, Saa, kaa, dictionary",
72
+ "Estonian, Saa, khk, dictionary",
73
+ "Estonian, Saa, kär, dictionary",
74
+ "Estonian, Saa, mus, dictionary",
75
+ "Estonian, Saa, pha, dictionary",
76
+ "Estonian, Saa, pöi, dictionary",
77
+ "Estonian, Seto",
78
+ "Estonian, Seto, dictionary",
79
+ "Estonian, Trm, dictionary",
80
+ "Estonian, Trt, nõo, dictionary",
81
+ "Estonian, Trt, ote, dictionary",
82
+ "Estonian, Trt, puh, dictionary",
83
+ "Estonian, Trt, ran, dictionary",
84
+ "Estonian, Tõs, dictionary",
85
+ "Estonian, Tür, dictionary",
86
+ "Estonian, Vair, dictionary",
87
+ "Estonian, Var, dictionary",
88
+ "Estonian, Vig, dictionary",
89
+ "Estonian, Vjg, dictionary",
90
+ "Estonian, Vng, dictionary",
91
+ "Estonian, Vän, dictionary",
92
+ "Estonian, bible",
93
+ "Estonian, dictionary",
94
+ "Estonian, Äks, dictionary",
95
+ "Finnish",
96
+ "Finnish, bible",
97
+ "Finnish, dictionary",
98
+ "French",
99
+ "German",
100
+ "Hill Mari",
101
+ "Hill Mari, bible",
102
+ "Hungarian",
103
+ "Hungarian, bible",
104
+ "Inari Sami",
105
+ "Inari Sami, dictionary",
106
+ "Izhorian",
107
+ "Izhorian, Alamaluuga, speech",
108
+ "Izhorian, Mehmet",
109
+ "Izhorian, Soikkola",
110
+ "Izhorian, Soikkola, speech",
111
+ "Izhorian, speech",
112
+ "Kazym Khanty",
113
+ "Kazym Khanty, 2000",
114
+ "Kazym Khanty, 2013",
115
+ "Kildin Sami",
116
+ "Kildin Sami, Orth1",
117
+ "Kildin Sami, Orth1, dictionary",
118
+ "Kildin Sami, Orth2",
119
+ "Kildin Sami, Orth2, dictionary",
120
+ "Komi-Permyak",
121
+ "Komi-Permyak, bible",
122
+ "Komi-Zyrian",
123
+ "Komi-Zyrian, bible",
124
+ "Komi-Zyrian, dictionary",
125
+ "Kven",
126
+ "Kven, dictionary",
127
+ "Latvian",
128
+ "Latvian, bible",
129
+ "Latvian, dictionary",
130
+ "Livonian, Idaliivi, ft",
131
+ "Livonian, Ira, ft",
132
+ "Livonian, Lääneliivi, ft",
133
+ "Livonian, Standard",
134
+ "Livonian, Standard, dictionary",
135
+ "Livvi",
136
+ "Livvi, Impilahti",
137
+ "Livvi, Kondushi",
138
+ "Livvi, Kotkozero",
139
+ "Livvi, Nekkula",
140
+ "Livvi, Newwritten",
141
+ "Livvi, Oldwritten",
142
+ "Livvi, Rypushkalitsa",
143
+ "Livvi, Salmi",
144
+ "Livvi, Syamozero",
145
+ "Livvi, Tulmozero",
146
+ "Livvi, Vedlozero",
147
+ "Livvi, Vidlitsa",
148
+ "Livvi, bible",
149
+ "Ludian",
150
+ "Ludian, Central",
151
+ "Ludian, Miikul",
152
+ "Ludian, Miikul, dictionary",
153
+ "Ludian, Mikhailovskoye",
154
+ "Ludian, Newwritten",
155
+ "Ludian, Northern",
156
+ "Ludian, Southern",
157
+ "Lule Sami",
158
+ "Lule Sami, dictionary",
159
+ "Mansi, Obs",
160
+ "Mansi, Sosv",
161
+ "Mansi, Unk",
162
+ "Mansi, Unk, bible",
163
+ "Mansi, Verh",
164
+ "Meadow Mari",
165
+ "Meadow Mari, bible",
166
+ "Meadow Mari, dictionary",
167
+ "Meänkieli",
168
+ "Moksha",
169
+ "Moksha, bible",
170
+ "Northern Sami",
171
+ "Northern Sami, dictionary",
172
+ "Norwegian",
173
+ "Norwegian, bible",
174
+ "Norwegian, dictionary",
175
+ "Pite Sami",
176
+ "Priur Khanty",
177
+ "Proper Karelian",
178
+ "Proper Karelian, Dyorzha",
179
+ "Proper Karelian, Ilomantsi",
180
+ "Proper Karelian, Keret",
181
+ "Proper Karelian, Kestenga",
182
+ "Proper Karelian, Kontokki",
183
+ "Proper Karelian, Korbiselga",
184
+ "Proper Karelian, Myandyselga",
185
+ "Proper Karelian, Newwritten",
186
+ "Proper Karelian, Newwrittentver",
187
+ "Proper Karelian, Oldwritten",
188
+ "Proper Karelian, Oldwrittentver",
189
+ "Proper Karelian, Oulanga",
190
+ "Proper Karelian, Padany",
191
+ "Proper Karelian, Panozero",
192
+ "Proper Karelian, Poduzhemye",
193
+ "Proper Karelian, Porosozero",
194
+ "Proper Karelian, Reboly",
195
+ "Proper Karelian, Rugozero",
196
+ "Proper Karelian, Suistamo",
197
+ "Proper Karelian, Suoyarvi",
198
+ "Proper Karelian, Suvi",
199
+ "Proper Karelian, Tikhtozero",
200
+ "Proper Karelian, Tikhvin",
201
+ "Proper Karelian, Tolmachi",
202
+ "Proper Karelian, Tunguda",
203
+ "Proper Karelian, Uhta",
204
+ "Proper Karelian, Valdai",
205
+ "Proper Karelian, Vesyegonsk",
206
+ "Proper Karelian, Viena",
207
+ "Proper Karelian, Voknavolok",
208
+ "Proper Karelian, Vychetaibola",
209
+ "Proper Karelian, Yushkozero",
210
+ "Proper Karelian, bible",
211
+ "Russian",
212
+ "Russian, bible",
213
+ "Russian, dictionary",
214
+ "Russian, speech",
215
+ "Shur Khanty",
216
+ "Shur Khanty, 2013",
217
+ "Shur Khanty, bible",
218
+ "Skolt Sami",
219
+ "Skolt Sami, dictionary",
220
+ "Southern Sami",
221
+ "Southern Sami, dictionary",
222
+ "Sred Khanty",
223
+ "Surgut Khanty",
224
+ "Surgut Khanty, 2000",
225
+ "Surgut Khanty, 2013",
226
+ "Swedish",
227
+ "Udmurt",
228
+ "Udmurt, bible",
229
+ "Udmurt, dictionary",
230
+ "Ume Sami",
231
+ "Unk Khanty",
232
+ "Vakh Khanty",
233
+ "Vakh Khanty, 2013",
234
+ "Veps",
235
+ "Veps, Centraleastern",
236
+ "Veps, Centralwestern",
237
+ "Veps, Newwritten",
238
+ "Veps, Northern",
239
+ "Veps, Southern",
240
+ "Veps, bible",
241
+ "Votic, I",
242
+ "Votic, Idavadja, ft",
243
+ "Votic, J",
244
+ "Votic, Ja",
245
+ "Votic, K",
246
+ "Votic, Ke",
247
+ "Votic, Kõ",
248
+ "Votic, L",
249
+ "Votic, Li",
250
+ "Votic, Lu",
251
+ "Votic, Läänevadja, ft",
252
+ "Votic, M",
253
+ "Votic, P",
254
+ "Votic, Po",
255
+ "Votic, R",
256
+ "Votic, Ra",
257
+ "Votic, S",
258
+ "Votic, Standard",
259
+ "Votic, U",
260
+ "Votic, Unk",
261
+ "Votic, Unk, dictionary",
262
+ "Votic, Unk, speech",
263
+ "Votic, V",
264
+ "Votic, dictionary",
265
+ "Võro, Har, dictionary",
266
+ "Võro, Kan, dictionary",
267
+ "Võro, Krl, dictionary",
268
+ "Võro, Lei, dictionary",
269
+ "Võro, Lut, dictionary",
270
+ "Võro, Plv, dictionary",
271
+ "Võro, Räp, dictionary",
272
+ "Võro, Rõu, dictionary",
273
+ "Võro, Setom",
274
+ "Võro, Sõnaq",
275
+ "Võro, Sõnaq, dictionary",
276
+ "Võro, Uma",
277
+ "Võro, Urv, dictionary",
278
+ "Võro, Vas, dictionary",
279
+ "Võro, X"
280
+ ]
281
+
282
def run_inference(text, from_lang, to_lang, mode):
    """Run one generation request through the module-level model.

    Args:
        text: Input segment; stored under the "src_segm" key of the entry.
        from_lang: Source language label; only used when mode is "translate".
        to_lang: Target language label; only used when mode is "translate".
        mode: Task name stored under "task". "translate" selects the
            PF_SMUGRI_MT prompt format; any other value selects PF_SMUGRI_LID.

    Returns:
        The first element of whatever ``llm_generate`` returns.
    """
    is_translation = mode == "translate"
    prompt_format = PF_SMUGRI_MT if is_translation else PF_SMUGRI_LID

    entry = {"src_segm": text, "task": mode}
    if is_translation:
        # Language labels are only attached for translation requests.
        entry["src_lang"] = from_lang
        entry["tgt_lang"] = to_lang

    # Single-item dataset: item 0 is the prepared input for this entry.
    dataset = LazyTokenizingInferenceDataset([entry], tokenizer, prompt_format)
    generated = llm_generate(model, tokenizer, dataset[0], debug=False, max_len=512)
    return generated[0]
294
+
295
with gr.Blocks() as demo:
    # Layout: input text, identify button, source/target language pickers,
    # translate button and the output box.
    text_input = gr.Textbox(label="Text", lines=6, placeholder="Enter text...")
    # Starts disabled, like the translate button: the text box is empty at
    # load time and toggle_identify enables it once text is entered.
    identify_btn = gr.Button("Identify language", interactive=False)
    with gr.Row():
        from_dropdown = gr.Dropdown(choices=languages, label="From", value=None)
        to_dropdown = gr.Dropdown(choices=languages, label="To", value=None)
    translate_btn = gr.Button("Translate", interactive=False)
    output = gr.Textbox(label="Output", lines=6)

    def toggle_identify(text):
        """Enable the identify button only when the text is non-blank."""
        # `text or ""` guards against a None value reaching .strip().
        return gr.update(interactive=bool((text or "").strip()))

    text_input.change(toggle_identify, [text_input], [identify_btn])

    def toggle_translate(text, f, t):
        """Enable the translate button only when text and both languages are set."""
        return gr.update(interactive=bool((text or "").strip() and f and t))

    # Any of the three inputs changing re-evaluates the translate button.
    translate_inputs = [text_input, from_dropdown, to_dropdown]
    for component in translate_inputs:
        component.change(toggle_translate, translate_inputs, [translate_btn])

    # Identify: the model output is written into the "From" dropdown. The
    # follow-up step then refreshes the translate button from the *live*
    # component values, which must be passed as event inputs; reading
    # `component.value` inside a callback only yields the build-time value.
    identify_btn.click(
        fn=lambda text: run_inference(text, None, None, mode="identify"),
        inputs=[text_input],
        outputs=[from_dropdown],
    ).then(
        toggle_translate, translate_inputs, [translate_btn]
    )

    translate_btn.click(
        fn=lambda text, f, t: run_inference(text, f, t, mode="translate"),
        inputs=translate_inputs,
        outputs=[output],
    )

if __name__ == "__main__":
    demo.launch()
kuidastaltsutadalaamat ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit efc69e6fd102f04dbf87f0245ec8be99d78d463f
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==5.41.0
2
+ torch>=2.1
3
+ accelerate
4
+ transformers
5
+ evaluate
6
+ packaging
7
+ ninja
8
+ #flash_attn