faisalsns committed on
Commit
023e017
·
1 Parent(s): f5cf0c0

Initial commit

Browse files
Files changed (4) hide show
  1. README.md +17 -0
  2. app.py +319 -0
  3. models/lid.176.bin +3 -0
  4. requirements.txt +5 -0
README.md CHANGED
@@ -12,3 +12,20 @@ short_description: compare language detection models
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+
16
+
17
+
18
+ # Language Detection Comparison App
19
+
20
+ This app compares language detection results from three sources:
21
+
22
+ - **Facebook fastText** (offline, accurate)
23
+ - **Google Cloud Translation API** (online, requires API key)
24
+ - **Hugging Face language detection model** (configurable)
25
+
26
+ ## Setup
27
+
28
+ 1. Install dependencies:
29
+
30
+ ```bash
31
+ pip install -r requirements.txt
app.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import gradio as gr
4
+ import fasttext
5
+ from google.cloud import translate_v2 as translate
6
+ from transformers import pipeline
7
+ from dotenv import load_dotenv
8
+ import subprocess
9
+
10
+
11
# --- Load the fastText language-identification model ---
# The model path is resolved relative to this file so the app works no matter
# which directory it is launched from.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(BASE_DIR, "models", "lid.176.bin")

# Load the model exactly once, inside the guard, so that a missing or corrupt
# model file surfaces as a clear RuntimeError (with the original cause chained)
# rather than as an unhandled ValueError from an earlier, unguarded load.
try:
    fasttext_model = fasttext.load_model(MODEL_PATH)
except ValueError as err:
    raise RuntimeError("FastText model file could not be loaded.") from err
30
+
31
+ # --- Setup Google Translate Client ---
32
+ # google_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
33
+ # if google_creds:
34
+ # with open("google_creds.json", "w") as f:
35
+ # f.write(google_creds)
36
+ # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "google_creds.json"
37
+ # translate_client = translate.Client()
38
+ # else:
39
+ # translate_client = None
40
+
41
+ #print("Current working directory:", os.getcwd())
42
+ #load_dotenv(dotenv_path=r"C:\_Prep\_code\Python\language-detection-compare-models\.env") # If needed
43
+ #C:\_Prep\_code\Python\language-detection-compare-models\.env
44
+
45
# --- Setup Google Translate client ---
def resolve_google_credentials_path(environ=None):
    """Return the path of the Google service-account key file, or None.

    The standard variable ``GOOGLE_APPLICATION_CREDENTIALS`` is checked first.
    The misspelled ``GOOGLE_APPLICATION_CREDENTIAL`` (no trailing "S") is still
    honoured as a fallback so existing deployments that set it keep working.

    Args:
        environ: Mapping to read the variables from; defaults to ``os.environ``.

    Returns:
        The first configured value that points at an existing file, else None.
    """
    env = os.environ if environ is None else environ
    for name in ("GOOGLE_APPLICATION_CREDENTIALS", "GOOGLE_APPLICATION_CREDENTIAL"):
        candidate = env.get(name)
        if candidate and os.path.isfile(candidate):
            return candidate
    return None


google_creds_path = resolve_google_credentials_path()
if google_creds_path:
    # Export under the standard name: this is the variable the Google client
    # library actually reads when it looks for default credentials.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_creds_path
    from google.cloud import translate_v2 as translate
    translate_client = translate.Client()
else:
    # No usable credentials: the Google column will report "Not Configured".
    translate_client = None
60
+
61
+
62
# --- Default Hugging Face detector ---
# Built once at import time; detect_languages falls back to this pipeline
# whenever no custom model path is supplied.
HF_MODEL_NAME = "papluca/xlm-roberta-base-language-detection"
hf_lang_detector = pipeline(task="text-classification", model=HF_MODEL_NAME)
65
+
66
# --- ISO 639-1 language code -> ISO 3166-1 alpha-2 country codes ---
# Hand-curated and capped at five countries per language; render_result turns
# each country code into a flag emoji. Edit as needed.
LANGUAGE_TO_COUNTRIES = dict(
    en=["US", "GB", "CA", "AU", "IN"],
    fr=["FR", "BE", "CA", "CH", "LU"],
    es=["ES", "MX", "CO", "AR", "PE"],
    de=["DE", "AT", "CH", "LU", "BE"],
    ar=["EG", "SA", "IQ", "DZ", "MA"],
    hi=["IN", "FJ", "MU", "NP", "SG"],
    zh=["CN", "SG", "MY", "TW", "HK"],
    ru=["RU", "BY", "KZ", "UA", "KG"],
    pt=["PT", "BR", "AO", "MZ", "GW"],
    ja=["JP"],
    ko=["KR"],
)
81
+
82
def flag_emoji(country_code):
    """Return the flag emoji for a two-letter country code.

    Each ASCII letter is mapped to its Unicode "regional indicator symbol"
    (U+1F1E6 is the indicator for "A"); a pair of indicators renders as a flag.

    Args:
        country_code: ISO 3166-1 alpha-2 code such as "US". Case-insensitive.

    Returns:
        The string of regional indicator symbols (empty for empty input).
    """
    # Normalise the case first: the offset arithmetic is only valid for A-Z,
    # so a lowercase code would otherwise yield unrelated code points.
    return "".join(
        chr(0x1F1E6 + ord(letter) - ord("A")) for letter in country_code.upper()
    )
84
+
85
def render_result(model_name, lang_code, score):
    """Build the HTML snippet shown for one detector's result.

    Args:
        model_name: Display name of the detector.
        lang_code: Detected language code, used to look up country flags.
        score: Confidence value, rendered as-is in parentheses.

    Returns:
        An HTML string with the model name, language code, score and either
        up to five flag emojis or a globe when the language is not mapped.
    """
    countries = LANGUAGE_TO_COUNTRIES.get(lang_code, [])
    if not countries:
        # Unmapped language: show a generic globe instead of flags.
        flags_markup, suffix = "🌐", ""
    else:
        flags_markup = " ".join(map(flag_emoji, countries[:5]))
        suffix = "" if len(countries) <= 5 else "<br>...etc"
    return f"<b>{model_name}:</b> <code>{lang_code}</code> ({score})<br>{flags_markup}{suffix}"
94
+
95
+ # def detect_languages(text, hf_model_path=None):
96
+ # # FastText
97
+ # try:
98
+ # ft_label, ft_score = fasttext_model.predict(text, k=1)
99
+ # ft_lang = ft_label[0].replace("__label__", "")
100
+ # ft_score = round(ft_score[0], 3)
101
+ # except Exception:
102
+ # ft_lang, ft_score = "Error", 0
103
+
104
+ # # Google Translate
105
+ # if translate_client:
106
+ # try:
107
+ # result = translate_client.detect_language(text)
108
+ # google_lang = result.get("language", "N/A")
109
+ # google_conf = round(result.get("confidence", 0), 3)
110
+ # except Exception:
111
+ # google_lang, google_conf = "Error", 0
112
+ # else:
113
+ # google_lang, google_conf = "NotConfigured", 0
114
+
115
+ # # Hugging Face
116
+ # try:
117
+ # model = (
118
+ # pipeline("text-classification", model=hf_model_path)
119
+ # if hf_model_path and hf_model_path.strip()
120
+ # else hf_lang_detector
121
+ # )
122
+ # hf_results = model(text)
123
+ # hf_lang = hf_results[0]["label"].lower()
124
+ # hf_score = round(hf_results[0]["score"], 3)
125
+ # except Exception:
126
+ # hf_lang, hf_score = "Error", 0
127
+
128
+ # return (
129
+ # render_result("FastText", ft_lang, ft_score),
130
+ # render_result("Google", google_lang, google_conf),
131
+ # render_result("HuggingFace", hf_lang, hf_score)
132
+ # )
133
+
134
+ from langcodes import Language
135
+
136
# Language code -> ISO 3166-1 alpha-2 codes of countries where the language is
# spoken. Lists may hold more than five entries; format_with_flags displays
# only the first five and appends an "etc..." marker for the rest.
LANG_COUNTRY_MAP = {
    "af": ["ZA", "NA"],
    "am": ["ET"],
    "ar": ["SA", "EG", "IQ", "MA", "DZ", "SD", "SY", "YE", "JO", "LB", "TN", "AE", "OM", "KW", "BH", "QA", "LY"],
    "az": ["AZ"],
    "be": ["BY"],
    "bg": ["BG"],
    "bn": ["BD", "IN"],
    "bs": ["BA"],
    "ca": ["ES", "AD"],
    "ceb": ["PH"],
    "cs": ["CZ"],
    "cy": ["GB"],
    "da": ["DK"],
    "de": ["DE", "AT", "CH", "LU", "BE", "LI"],
    "el": ["GR", "CY"],
    "en": ["US", "GB", "CA", "AU", "NZ", "IE", "ZA", "IN", "PH", "NG", "KE", "UG"],
    "eo": ["PL", "FR", "DE", "US"],
    "es": ["ES", "MX", "CO", "AR", "PE", "VE", "CL", "EC", "GT", "CU", "BO", "DO", "HN", "PY", "SV", "NI", "CR", "PA", "UY"],
    "et": ["EE"],
    "eu": ["ES", "FR"],
    "fa": ["IR", "AF", "TJ"],
    "fi": ["FI"],
    "fil": ["PH"],
    "fj": ["FJ"],
    "fr": ["FR", "BE", "CA", "CH", "LU", "CI", "SN", "ML", "CM", "HT", "MG", "NE", "TG", "GA", "CD", "BF", "TD"],
    "fy": ["NL"],
    "ga": ["IE"],
    "gd": ["GB"],
    "gl": ["ES"],
    "gu": ["IN"],
    "ha": ["NG", "NE", "GH"],
    "haw": ["US"],
    "he": ["IL"],
    "hi": ["IN", "FJ", "MU", "NP", "SG"],
    "hmn": ["US"],
    "hr": ["HR", "BA"],
    "ht": ["HT"],
    "hu": ["HU"],
    "hy": ["AM"],
    "id": ["ID"],
    "ig": ["NG"],
    "is": ["IS"],
    "it": ["IT", "CH", "SM"],
    "ja": ["JP"],
    "jv": ["ID"],
    "ka": ["GE"],
    "kk": ["KZ"],
    "km": ["KH"],
    "kn": ["IN"],
    "ko": ["KR", "KP"],
    "ku": ["IQ", "TR", "SY", "IR"],
    "ky": ["KG"],
    "la": ["VA"],
    "lb": ["LU"],
    "lo": ["LA"],
    "lt": ["LT"],
    "lv": ["LV"],
    "mg": ["MG"],
    "mi": ["NZ"],
    "mk": ["MK"],
    "ml": ["IN"],
    "mn": ["MN"],
    "mr": ["IN"],
    "ms": ["MY", "BN", "SG"],
    "mt": ["MT"],
    "my": ["MM"],
    "ne": ["NP"],
    "nl": ["NL", "BE", "SR", "AW", "CW"],
    "no": ["NO"],
    "ny": ["MW", "ZM", "ZW"],
    "pa": ["IN", "PK"],
    "pl": ["PL"],
    "ps": ["AF"],
    "pt": ["PT", "BR", "AO", "MZ", "GW", "ST", "CV"],
    "ro": ["RO", "MD"],
    "ru": ["RU", "BY", "KZ", "KG", "UA"],
    "rw": ["RW"],
    "sd": ["PK"],
    "si": ["LK"],
    "sk": ["SK"],
    "sl": ["SI"],
    "sm": ["WS"],
    "sn": ["ZW"],
    "so": ["SO"],
    "sq": ["AL", "XK", "MK"],
    "sr": ["RS", "BA", "ME"],
    "st": ["LS"],
    "su": ["ID"],
    "sv": ["SE", "FI"],
    "sw": ["KE", "TZ", "UG"],
    "ta": ["IN", "LK", "SG", "MY"],
    "te": ["IN"],
    "tg": ["TJ"],
    "th": ["TH"],
    "ti": ["ET", "ER"],
    "tk": ["TM"],
    "tl": ["PH"],
    "tr": ["TR", "CY"],
    "tt": ["RU"],
    "ug": ["CN"],
    "uk": ["UA"],
    "ur": ["PK", "IN"],
    "uz": ["UZ"],
    "vi": ["VN"],
    "xh": ["ZA"],
    "yi": ["US", "IL"],
    "yo": ["NG"],
    "zh": ["CN", "SG", "MY", "TW"],
    "zu": ["ZA"],
}
248
+
249
+
250
def country_flag_img(country_code):
    """Return an HTML ``<img>`` tag (followed by ``<br/>``) for a country flag.

    Args:
        country_code: ISO 3166-1 alpha-2 country code, e.g. "US".

    Returns:
        Markup that loads the 40px-wide flag from flagcdn.com and uses the
        upper-cased country code as the tooltip.
    """
    # The tooltip is the country code itself. It must not be looked up in the
    # language-keyed map: a lowercase code such as "es" would match a language
    # entry and inject a quoted Python list into the single-quoted attribute.
    tooltip = country_code.upper()
    return f"<img src='https://flagcdn.com/w40/{country_code.lower()}.png' title='{tooltip}' height='20' style='margin-right:4px'/><br/>"
253
+
254
def format_with_flags(lang_code):
    """Return flag-image HTML for the countries mapped to *lang_code*.

    At most five flags are rendered; when the language maps to more than five
    countries an "etc..." marker is appended. Unmapped codes yield "".
    """
    countries = LANG_COUNTRY_MAP.get(lang_code, [])
    pieces = [country_flag_img(code) for code in countries[:5]]
    if len(countries) > 5:
        pieces.append("<span style='margin-left:4px;'>etc...</span>")
    return "".join(pieces)
260
+
261
def _detect_with_fasttext(text):
    """Return ``(language_code, confidence)`` from the local fastText model."""
    # fastText's predict() handles one line at a time and raises on embedded
    # newlines, so collapse all whitespace (the input box is multi-line).
    single_line = " ".join(text.split())
    try:
        labels, scores = fasttext_model.predict(single_line, k=1)
        return labels[0].replace("__label__", ""), round(float(scores[0]), 3)
    except Exception:
        # Best effort, like the other backends: report the failure in the UI
        # instead of aborting the whole comparison.
        return "Error", 0


def _detect_with_google(text):
    """Return ``(language_code, confidence)`` from Google Cloud Translation."""
    if not translate_client:
        return "Not Configured", 0
    try:
        result = translate_client.detect_language(text)
        return result.get("language", "N/A"), round(result.get("confidence", 0), 3)
    except Exception:
        return "Error", 0


def _detect_with_huggingface(text, hf_model_path):
    """Return ``(label, score)`` from the default or a custom HF pipeline."""
    requested = (hf_model_path or "").strip()
    try:
        if requested and requested != HF_MODEL_NAME:
            detector = pipeline("text-classification", model=requested)
        else:
            # The UI textbox is pre-filled with the default model id; reuse the
            # pipeline built at import time rather than rebuilding it per click.
            detector = hf_lang_detector
        top = detector(text)[0]
        return top["label"].lower(), round(top["score"], 3)
    except Exception:
        return "error", 0


def detect_languages(text, hf_model_path=None):
    """Run all three language detectors on *text* and format the results.

    Args:
        text: The text to analyse; may span multiple lines.
        hf_model_path: Optional Hugging Face model id or path. Blank, None or
            the default model id selects the preloaded default detector.

    Returns:
        A 3-tuple of HTML strings (fastText, Google API, Hugging Face), each
        showing the detected code, its score and the matching country flags.
        A failing backend is reported in its own slot rather than raising.
    """
    text = text or ""
    if not text.strip():
        # Nothing to classify: skip every backend instead of sending it blank input.
        prompt = "Please enter some text."
        return (
            f"FastText: {prompt}",
            f"Google API: {prompt}",
            f"HuggingFace: {prompt}",
        )

    ft_lang, ft_score = _detect_with_fasttext(text)
    google_lang, google_conf = _detect_with_google(text)
    hf_label, hf_score = _detect_with_huggingface(text, hf_model_path)

    return (
        f"FastText: {ft_lang} ({ft_score})<br>{format_with_flags(ft_lang)}",
        f"Google API: {google_lang} ({google_conf})<br>{format_with_flags(google_lang)}",
        f"HuggingFace: {hf_label} ({hf_score})<br>{format_with_flags(hf_label)}"
    )
295
+
296
# --- Gradio UI: text input, optional model override, three result panes ---
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Language Detection Comparison")

    with gr.Row():
        input_text = gr.TextArea(
            label="Enter text",
            lines=4,
            placeholder="Type text to detect language...",
        )

    with gr.Row():
        hf_model_path = gr.Textbox(
            label="HuggingFace Model Path (optional)",
            value="papluca/xlm-roberta-base-language-detection",
            placeholder="e.g. papluca/xlm-roberta-base-language-detection",
        )

    detect_btn = gr.Button("Detect Language")

    # One HTML pane per detector, laid out side by side.
    with gr.Row():
        fasttext_out = gr.HTML(label="FastText")
        google_out = gr.HTML(label="Google")
        hf_out = gr.HTML(label="Hugging Face")

    # detect_languages returns a 3-tuple, matched positionally to the panes.
    detect_btn.click(
        fn=detect_languages,
        inputs=[input_text, hf_model_path],
        outputs=[fasttext_out, google_out, hf_out],
    )

if __name__ == "__main__":
    demo.launch()
models/lid.176.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e
3
+ size 131266198
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ fasttext
3
+ google-cloud-translate
4
+ transformers
5
+ torch