"""Gradio demo for RuthLemm, a BART-based lemmatizer for Old Belarusian (Ruthenian)."""

import os
import re
import tempfile
import time

import gradio as gr
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel

# --- 1. MODEL LOADING ---
# Load both models once at startup so every request reuses them instead of
# paying the (large) model-initialization cost per call.
print("Loading models...")
MODELS = {
    "no_morph": Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="Futyn-Maker/RuthLemm",
        use_cuda=False,
    ),
    "morph": Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="Futyn-Maker/RuthLemm-morphology",
        use_cuda=False,
    ),
}
print("Models loaded successfully!")


# --- 2. PREPROCESSING LOGIC ---
def preprocess_form(form: str, pos: str = "") -> str:
    """Normalize a surface form before feeding it to the model.

    Standalone bracket tokens are returned unchanged; brackets embedded
    inside a word are stripped. Proper nouns (``pos == "PROPN"``) are
    capitalized, everything else is lowercased.
    """
    # Standalone bracket tokens are legitimate punctuation — keep them.
    if form in {"(", ")", "[", "]"}:
        return form

    # Brackets inside a word are editorial artifacts — strip them.
    processed_form = re.sub(r"[()\[\]]", "", form)

    # Case rule depends on part of speech.
    if pos == "PROPN":
        return processed_form.capitalize()
    return processed_form.lower()


# --- 3. CORE FUNCTIONS FOR GRADIO TABS ---
def lemmatize_string(raw_text: str) -> str:
    """Lemmatize a raw string using the non-morphological model."""
    if not raw_text.strip():
        return ""

    # Tokenize while preserving punctuation as separate tokens.
    tokens = re.findall(r"\w+|[^\w\s]", raw_text)

    words_to_predict = []
    # Sentinel marking positions of words so the sentence can be
    # reconstructed after prediction.
    placeholder = object()
    reconstruction_map = []

    for token in tokens:
        if re.match(r"\w+", token):  # word token
            # Raw text carries no POS tags, so the default (lowercase)
            # preprocessing rule applies.
            words_to_predict.append(preprocess_form(token))
            reconstruction_map.append(placeholder)
        else:  # punctuation token
            reconstruction_map.append(token)

    # Nothing to predict: hand the input back unchanged.
    if not words_to_predict:
        return raw_text

    predictions = MODELS["no_morph"].predict(words_to_predict)
    pred_iter = iter(predictions)

    output_parts = [
        next(pred_iter) if item is placeholder else item
        for item in reconstruction_map
    ]

    # Join with spaces, then drop the space the join inserted before
    # sentence punctuation.
    return re.sub(r" ([.,?!])", r"\1", " ".join(output_parts))


def lemmatize_conllu(
        conllu_input_text: str,
        conllu_file_obj,
        use_morphology: bool):
    """Lemmatize CoNLL-U formatted data with the selected model.

    Args:
        conllu_input_text: CoNLL-U data pasted into the textbox.
        conllu_file_obj: Optional uploaded file object; takes precedence
            over the pasted text when present.
        use_morphology: When True, append POS tags and morphological
            features to each model input and use the morphology model.

    Returns:
        Tuple of (lemmatized CoNLL-U text, path to a downloadable copy).
        The path is ``None`` when there was nothing to lemmatize.
    """
    # Determine the input source (uploaded file wins over pasted text).
    if conllu_file_obj is not None:
        with open(conllu_file_obj.name, "r", encoding="utf-8") as f:
            conllu_text = f.read()
    else:
        conllu_text = conllu_input_text

    if not conllu_text.strip():
        return "", None

    lines = conllu_text.strip().split("\n")

    inputs_for_model = []
    token_lines_indices = []  # indices of lines that are actual tokens

    for i, line in enumerate(lines):
        if line.startswith("#") or not line.strip():
            continue

        parts = line.split("\t")
        if len(parts) < 6:  # skip malformed lines
            continue

        # Multiword-token ranges ("1-2") and empty nodes ("1.1") carry no
        # lemma of their own in CoNLL-U — leave their "_" column untouched.
        token_id = parts[0]
        if "-" in token_id or "." in token_id:
            continue

        token_lines_indices.append(i)
        form, pos, features = parts[1], parts[3], parts[5]
        preprocessed_form = preprocess_form(form, pos)

        if use_morphology:
            inputs_for_model.append(f"{preprocessed_form} {pos} {features}")
        else:
            inputs_for_model.append(preprocessed_form)

    # If no valid token lines were found, return the original text.
    if not inputs_for_model:
        return conllu_text, None

    # Select the model and predict.
    model = MODELS["morph"] if use_morphology else MODELS["no_morph"]
    predictions = model.predict(inputs_for_model)

    # Write predictions back into the LEMMA column (index 2).
    pred_iter = iter(predictions)
    output_lines = list(lines)  # mutable copy
    for line_idx in token_lines_indices:
        parts = output_lines[line_idx].split("\t")
        parts[2] = next(pred_iter)
        output_lines[line_idx] = "\t".join(parts)

    final_output = "\n".join(output_lines)

    # Write the result to a unique temp file for the download widget.
    # tempfile is portable (unlike a hard-coded /tmp) and avoids name
    # collisions between concurrent requests in the same second.
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        prefix=f"lemmatized_{int(time.time())}_",
        suffix=".conllu",
        delete=False,
    ) as f:
        f.write(final_output)
        output_filename = f.name

    return final_output, output_filename


# --- 4. GRADIO UI ---

# Explanatory text
readme_text = """
# RuthLemm Demo

This is a demonstration of **RuthLemm**, a transformer (BART-based) lemmatizer for the Old Belarusian (Ruthenian) language. It can process raw text or files in the CoNLL-U format used by Universal Dependencies.

### How to Use:

1.  **Lemmatize String:** Enter any text in the text box. The tool will tokenize it, lemmatize each word, and return the result. This mode does not use morphological information.
2.  **Lemmatize CoNLL-U:** Paste your CoNLL-U data into the text box or upload a `.conllu` file.
    *   You can choose whether to use morphological features to improve accuracy via the **"Use Morphology"** checkbox.
    *   The output will be the same CoNLL-U data with the `LEMMA` column updated. You can copy the result or download it as a file.
"""

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(readme_text)

    with gr.Tabs():
        with gr.TabItem("Lemmatize String"):
            with gr.Row():
                string_input = gr.Textbox(
                    lines=8, label="Input Text", placeholder="")
                string_output = gr.Textbox(
                    lines=8, label="Lemmatized Output", interactive=False)
            lemmatize_string_btn = gr.Button("Lemmatize", variant="primary")

        with gr.TabItem("Lemmatize CoNLL-U"):
            use_morphology_checkbox = gr.Checkbox(
                label="Use Morphology",
                value=False,
                info="Check this to use POS tags and morphological features for better accuracy.")
            with gr.Row():
                with gr.Column():
                    conllu_input_text = gr.Textbox(
                        lines=10, label="Paste CoNLL-U Data Here", placeholder="")
                    conllu_upload = gr.File(
                        label="Or Upload a .conllu File", file_types=[".conllu"])
                with gr.Column():
                    conllu_output_text = gr.Textbox(
                        lines=10,
                        label="Lemmatized CoNLL-U Output",
                        interactive=False,
                        show_copy_button=True)
                    conllu_download = gr.File(
                        label="Download Result", interactive=False)
            lemmatize_conllu_btn = gr.Button(
                "Lemmatize CoNLL-U", variant="primary")

    # Button click events
    lemmatize_string_btn.click(
        fn=lemmatize_string,
        inputs=[string_input],
        outputs=[string_output]
    )

    lemmatize_conllu_btn.click(
        fn=lemmatize_conllu,
        inputs=[conllu_input_text, conllu_upload, use_morphology_checkbox],
        outputs=[conllu_output_text, conllu_download]
    )

if __name__ == "__main__":
    demo.launch()