|
import html
import json
import logging
import os
import subprocess

import fasttext
import gradio as gr
from dotenv import load_dotenv
from google.cloud import translate_v2 as translate
from transformers import pipeline
|
|
|
|
|
# Locate the fastText language-identification model relative to this file so
# the app does not depend on the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(BASE_DIR, "models", "lid.176.bin")

# Load the model exactly once. The file previously loaded it a second time,
# unguarded, before this handler, so a failing load never reached the
# RuntimeError below. Only ValueError is translated, as in the original
# handler; the original exception is chained so the root cause stays visible.
try:
    fasttext_model = fasttext.load_model(MODEL_PATH)
except ValueError as err:
    raise RuntimeError("FastText model file could not be loaded.") from err
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Path to the Google service-account key file. The Google client libraries
# read GOOGLE_APPLICATION_CREDENTIALS (plural); the singular spelling that this
# file used before is still honoured as a fallback for existing setups.
# NOTE(review): load_dotenv is imported at the top of the file but never
# called in the visible code - confirm whether .env loading is intended.
google_creds_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") or os.getenv(
    "GOOGLE_APPLICATION_CREDENTIAL"
)

# Only build the Translate client when the key file really exists; otherwise
# translate_client stays None and the Google column reports "Not Configured".
if google_creds_path and os.path.isfile(google_creds_path):
    # Export under the correctly spelled name so the client can find the key
    # even when only the legacy variable was set.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_creds_path
    translate_client = translate.Client()
else:
    translate_client = None
|
|
|
|
|
|
|
# Default Hugging Face text-classification model used for language detection.
HF_MODEL_NAME = "papluca/xlm-roberta-base-language-detection"

# Built once at import time so requests that use the default model can share it.
hf_lang_detector = pipeline("text-classification", model=HF_MODEL_NAME)
|
|
|
|
|
|
|
# Language code -> up to five two-letter country codes whose flags
# render_result shows next to a detection result.
LANGUAGE_TO_COUNTRIES = {
    "en": ["US", "GB", "CA", "AU", "IN"],
    "fr": ["FR", "BE", "CA", "CH", "LU"],
    "es": ["ES", "MX", "CO", "AR", "PE"],
    "de": ["DE", "AT", "CH", "LU", "BE"],
    "ar": ["EG", "SA", "IQ", "DZ", "MA"],
    "hi": ["IN", "FJ", "MU", "NP", "SG"],
    "zh": ["CN", "SG", "MY", "TW", "HK"],
    "ru": ["RU", "BY", "KZ", "UA", "KG"],
    "pt": ["PT", "BR", "AO", "MZ", "GW"],
    "ja": ["JP"],
    "ko": ["KR"],
}
|
|
|
def flag_emoji(country_code: str) -> str:
    """Return the flag emoji for a two-letter country code.

    Each letter A-Z is mapped to the matching Unicode regional-indicator
    symbol (starting at U+1F1E6 for "A"); a pair of those symbols is shown
    as a flag by emoji-capable renderers.

    Args:
        country_code: Country code such as ``"US"``. Lowercase input is
            accepted and treated like its uppercase form.

    Returns:
        The concatenated regional-indicator symbols, or ``""`` for an
        empty code.
    """
    # Upper-case first: the offset arithmetic is only valid for "A".."Z",
    # so a lowercase code would otherwise map to unrelated code points.
    return "".join(
        chr(0x1F1E6 + ord(letter) - ord("A")) for letter in country_code.upper()
    )
|
|
|
def render_result(model_name, lang_code, score):
    """Build an HTML snippet summarising one model's detection result.

    Args:
        model_name: Display name of the detector, shown in bold.
        lang_code: Detected language code, looked up in LANGUAGE_TO_COUNTRIES.
        score: Confidence value, rendered as-is inside parentheses.

    Returns:
        An HTML string with the model name, the language code, the score and
        either the flags of up to five associated countries or a globe emoji
        when the language has no entry in LANGUAGE_TO_COUNTRIES.
    """
    flags = LANGUAGE_TO_COUNTRIES.get(lang_code, [])
    if flags:
        # Show at most five flags and hint at the rest with "...etc".
        flag_str = " ".join(flag_emoji(c) for c in flags[:5])
        etc = "<br>...etc" if len(flags) > 5 else ""
    else:
        flag_str = "🌐"
        etc = ""
    return f"<b>{model_name}:</b> <code>{lang_code}</code> ({score})<br>{flag_str}{etc}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from langcodes import Language |
|
|
|
|
|
# Language code -> two-letter country codes used to pick the flag images
# shown next to a detection result (format_with_flags displays the first five).
LANG_COUNTRY_MAP = {
    "af": ["ZA", "NA"],
    "am": ["ET"],
    "ar": ["SA", "EG", "IQ", "MA", "DZ", "SD", "SY", "YE", "JO", "LB", "TN", "AE", "OM", "KW", "BH", "QA", "LY"],
    "az": ["AZ"],
    "be": ["BY"],
    "bg": ["BG"],
    "bn": ["BD", "IN"],
    "bs": ["BA"],
    "ca": ["ES", "AD"],
    "ceb": ["PH"],
    "cs": ["CZ"],
    "cy": ["GB"],
    "da": ["DK"],
    "de": ["DE", "AT", "CH", "LU", "BE", "LI"],
    "el": ["GR", "CY"],
    "en": ["US", "GB", "CA", "AU", "NZ", "IE", "ZA", "IN", "PH", "NG", "KE", "UG"],
    "eo": ["PL", "FR", "DE", "US"],
    "es": ["ES", "MX", "CO", "AR", "PE", "VE", "CL", "EC", "GT", "CU", "BO", "DO", "HN", "PY", "SV", "NI", "CR", "PA", "UY"],
    "et": ["EE"],
    "eu": ["ES", "FR"],
    "fa": ["IR", "AF", "TJ"],
    "fi": ["FI"],
    "fil": ["PH"],
    "fj": ["FJ"],
    "fr": ["FR", "BE", "CA", "CH", "LU", "CI", "SN", "ML", "CM", "HT", "MG", "NE", "TG", "GA", "CD", "BF", "TD"],
    "fy": ["NL"],
    "ga": ["IE"],
    "gd": ["GB"],
    "gl": ["ES"],
    "gu": ["IN"],
    "ha": ["NG", "NE", "GH"],
    "haw": ["US"],
    "he": ["IL"],
    "hi": ["IN", "FJ", "MU", "NP", "SG"],
    "hmn": ["US"],
    "hr": ["HR", "BA"],
    "ht": ["HT"],
    "hu": ["HU"],
    "hy": ["AM"],
    "id": ["ID"],
    "ig": ["NG"],
    "is": ["IS"],
    "it": ["IT", "CH", "SM"],
    "ja": ["JP"],
    "jv": ["ID"],
    "ka": ["GE"],
    "kk": ["KZ"],
    "km": ["KH"],
    "kn": ["IN"],
    "ko": ["KR", "KP"],
    "ku": ["IQ", "TR", "SY", "IR"],
    "ky": ["KG"],
    "la": ["VA"],
    "lb": ["LU"],
    "lo": ["LA"],
    "lt": ["LT"],
    "lv": ["LV"],
    "mg": ["MG"],
    "mi": ["NZ"],
    "mk": ["MK"],
    "ml": ["IN"],
    "mn": ["MN"],
    "mr": ["IN"],
    "ms": ["MY", "BN", "SG"],
    "mt": ["MT"],
    "my": ["MM"],
    "ne": ["NP"],
    "nl": ["NL", "BE", "SR", "AW", "CW"],
    "no": ["NO"],
    "ny": ["MW", "ZM", "ZW"],
    "pa": ["IN", "PK"],
    "pl": ["PL"],
    "ps": ["AF"],
    "pt": ["PT", "BR", "AO", "MZ", "GW", "ST", "CV"],
    "ro": ["RO", "MD"],
    "ru": ["RU", "BY", "KZ", "KG", "UA"],
    "rw": ["RW"],
    "sd": ["PK"],
    "si": ["LK"],
    "sk": ["SK"],
    "sl": ["SI"],
    "sm": ["WS"],
    "sn": ["ZW"],
    "so": ["SO"],
    "sq": ["AL", "XK", "MK"],
    "sr": ["RS", "BA", "ME"],
    "st": ["LS"],
    "su": ["ID"],
    "sv": ["SE", "FI"],
    "sw": ["KE", "TZ", "UG"],
    "ta": ["IN", "LK", "SG", "MY"],
    "te": ["IN"],
    "tg": ["TJ"],
    "th": ["TH"],
    "ti": ["ET", "ER"],
    "tk": ["TM"],
    "tl": ["PH"],
    "tr": ["TR", "CY"],
    "tt": ["RU"],
    "ug": ["CN"],
    "uk": ["UA"],
    "ur": ["PK", "IN"],
    "uz": ["UZ"],
    "vi": ["VN"],
    "xh": ["ZA"],
    "yi": ["US", "IL"],
    "yo": ["NG"],
    "zh": ["CN", "SG", "MY", "TW"],
    "zu": ["ZA"],
}
|
|
|
|
|
def country_flag_img(country_code: str) -> str:
    """Return an ``<img>`` tag (followed by ``<br/>``) for a country's flag.

    Args:
        country_code: Two-letter country code, e.g. ``"DE"``.

    Returns:
        HTML pointing at the 40px-wide flagcdn.com PNG for the country, with
        the upper-cased country code as the tooltip.
    """
    # The tooltip used to be looked up in LANG_COUNTRY_MAP, which is keyed by
    # language code: uppercase country codes never matched, and a match would
    # have dropped a Python list repr (with quotes) into the attribute. The
    # country code itself is the intended tooltip.
    return (
        f"<img src='https://flagcdn.com/w40/{country_code.lower()}.png' "
        f"title='{country_code.upper()}' height='20' style='margin-right:4px'/><br/>"
    )
|
|
|
def format_with_flags(lang_code):
    """Return flag-image HTML for the countries associated with a language.

    Args:
        lang_code: Language code reported by a detector. Case is ignored, and
            a regional tag such as ``"zh-CN"`` or ``"pt_BR"`` falls back to
            its primary subtag when the full tag has no entry in
            LANG_COUNTRY_MAP.

    Returns:
        Concatenated ``country_flag_img`` markup for at most five countries,
        followed by an "etc..." marker when more exist. Returns ``""`` for an
        empty or unknown code (including status strings such as "Error").
    """
    if not lang_code:
        return ""

    normalized = str(lang_code).strip().lower().replace("_", "-")
    countries = LANG_COUNTRY_MAP.get(normalized)
    if countries is None:
        # Detectors may report region-qualified tags; retry with the bare
        # language part before giving up.
        countries = LANG_COUNTRY_MAP.get(normalized.split("-", 1)[0], [])

    flags_html = "".join(country_flag_img(c) for c in countries[:5])
    if len(countries) > 5:
        flags_html += "<span style='margin-left:4px;'>etc...</span>"
    return flags_html
|
|
|
_logger = logging.getLogger(__name__)


def _detect_fasttext(text):
    """Return ``(language, score)`` predicted by the module-level fastText model."""
    # fastText's predict() processes one line at a time and rejects text that
    # contains a newline; the UI offers a multi-line text area, so flatten
    # line breaks into spaces before predicting.
    single_line = " ".join(text.splitlines())
    labels, scores = fasttext_model.predict(single_line, k=1)
    if not labels:
        # Guard against an empty prediction instead of raising IndexError.
        return "N/A", 0
    return labels[0].replace("__label__", ""), round(float(scores[0]), 3)


def _detect_google(text):
    """Return ``(language, confidence)`` from Google Translate, or a status label."""
    if not translate_client:
        return "Not Configured", 0
    try:
        result = translate_client.detect_language(text)
        return result.get("language", "N/A"), round(result.get("confidence", 0), 3)
    except Exception:
        # Best effort: the UI shows "Error", the log keeps the actual cause.
        _logger.exception("Google language detection failed")
        return "Error", 0


def _detect_huggingface(text, hf_model_path):
    """Return ``(label, score)`` from the default or a caller-supplied HF model."""
    model_name = (hf_model_path or "").strip()
    try:
        if not model_name or model_name == HF_MODEL_NAME:
            # The UI pre-fills the default model name; reuse the pipeline that
            # was loaded at import time instead of rebuilding it per request.
            detector = hf_lang_detector
        else:
            detector = pipeline("text-classification", model=model_name)
        results = detector(text)
    except Exception:
        _logger.exception("Hugging Face language detection failed (model=%r)", model_name)
        results = [{"label": "Error", "score": 0}]
    return results[0]["label"].lower(), round(results[0]["score"], 3)


def detect_languages(text, hf_model_path=None):
    """Detect the language of ``text`` with fastText, Google and Hugging Face.

    Args:
        text: Text to classify; ``None`` is treated as an empty string.
        hf_model_path: Optional Hugging Face model id or path. When empty or
            equal to HF_MODEL_NAME, the preloaded ``hf_lang_detector`` is used.

    Returns:
        A 3-tuple of HTML strings (FastText, Google API, HuggingFace), each
        holding the detected label, its score and the flag markup produced by
        ``format_with_flags``. Google and Hugging Face failures are logged and
        reported as an "Error" label rather than raised.
    """
    text = text or ""

    ft_lang, ft_score = _detect_fasttext(text)
    google_lang, google_conf = _detect_google(text)
    hf_label, hf_score = _detect_huggingface(text, hf_model_path)

    # Labels can come from an arbitrary user-selected model, so escape them
    # before they are embedded in the HTML outputs.
    return (
        f"FastText: {html.escape(str(ft_lang))} ({ft_score})<br>{format_with_flags(ft_lang)}",
        f"Google API: {html.escape(str(google_lang))} ({google_conf})<br>{format_with_flags(google_lang)}",
        f"HuggingFace: {html.escape(str(hf_label))} ({hf_score})<br>{format_with_flags(hf_label)}",
    )
|
|
|
# Gradio UI: a text area and an optional model field feed detect_languages,
# whose three HTML results are shown side by side.
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Language Detection Comparison")

    with gr.Row():
        input_text = gr.TextArea(
            label="Enter text",
            lines=4,
            placeholder="Type text to detect language...",
            value=(
                "Die Renaissance war eine kulturelle und intellektuelle Bewegung, die im 14. Jahrhundert in Italien begann und sich bis ins 17. Jahrhundert über Europa ausbreitete. "
                "Sie markierte eine Wiederbelebung der klassischen Kunst, Literatur und Wissenschaft, die den Humanismus, die wissenschaftliche Forschung und den individuellen Ausdruck betonte. "
                "Zu den Schlüsselpersonen gehören Leonardo da Vinci, Michelangelo und Galileo."
            ),
        )

    with gr.Row():
        hf_model_path = gr.Textbox(
            label="HuggingFace Model Path (optional)",
            # Same value as before, taken from the shared constant so the
            # pre-filled default always names the preloaded model.
            value=HF_MODEL_NAME,
            placeholder="e.g. papluca/xlm-roberta-base-language-detection",
        )

    detect_btn = gr.Button("Detect Language")

    with gr.Row():
        fasttext_out = gr.HTML(label="FastText")
        google_out = gr.HTML(label="Google")
        hf_out = gr.HTML(label="Hugging Face")

    # The order of outputs matches the tuple returned by detect_languages.
    detect_btn.click(
        detect_languages,
        inputs=[input_text, hf_model_path],
        outputs=[fasttext_out, google_out, hf_out],
    )
|
|
|
# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()
|
|