import streamlit as st
import streamlit.components.v1 as components
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
from fpdf import FPDF

# User interface
st.set_page_config(
    page_title="Traduction d'une phrase en pictogrammes ARASAAC",
    page_icon="📝",
    layout="wide"
)

# Load the model and the tokenizer
# checkpoint = "Propicto/t2p-t5-large-orfeo"
checkpoint = "Propicto/t2p-nllb-200-distilled-600M-all"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
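
# Streamlit reruns this whole script on every interaction, so the model and
# tokenizer above are reloaded each time. A minimal cached variant (an
# assumption, not part of the original app) using st.cache_resource:
#
# @st.cache_resource
# def load_model(name):
#     return AutoTokenizer.from_pretrained(name), AutoModelForSeq2SeqLM.from_pretrained(name)
#
# tokenizer, model = load_model(checkpoint)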

# Read the lexicon (tab-separated file mapping lemmas to ARASAAC pictogram IDs)
def read_lexicon(lexicon):
    df = pd.read_csv(lexicon, sep='\t')
    # Drop the " #category" suffix and join multiword lemmas with underscores
    df['keyword_no_cat'] = df['lemma'].str.split(' #').str[0].str.strip().str.replace(' ', '_')
    return df

lexicon = read_lexicon("lexicon.csv")
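
# Assumed layout of lexicon.csv (tab-separated despite the extension); the
# rows below are hypothetical examples, not actual ARASAAC data:
#
#   lemma            id_picto
#   manger #verbe    12345
#   pomme de terre   67890
#
# A cached variant of the loader (an assumption, not part of the original
# app) would avoid re-reading the file on every rerun:
#
# @st.cache_data
# def read_lexicon_cached(path):
#     return read_lexicon(path)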

# Post-process the translation output: split it into predicted lemmas
def process_output_trad(pred):
    return pred.split()

def get_id_picto_from_predicted_lemma(df_lexicon, lemma):
    # Strip any trailing "!" before the lexicon lookup
    if lemma.endswith("!"):
        lemma = lemma[:-1]
    id_picto = df_lexicon.loc[df_lexicon['keyword_no_cat'] == lemma, 'id_picto'].tolist()
    # Return the first matching ID, or the sentinel 0 when the lemma is unknown
    return (id_picto[0], lemma) if id_picto else (0, lemma)
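
# Example with a hypothetical entry mapping "manger" to ID 12345:
#   get_id_picto_from_predicted_lemma(lexicon, "manger!")  # -> (12345, "manger")
#   get_id_picto_from_predicted_lemma(lexicon, "inconnu")  # -> (0, "inconnu")
# The sentinel ID 0 is skipped by generate_html and generate_pdf below.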

# Generate the HTML content that displays the pictograms
def generate_html(ids):
    html_content = '<html><head><style>'
    html_content += '''
        figure {
            display: inline-block;
            text-align: center;
            font-family: Arial, sans-serif;
            margin: 0;
        }
        figcaption {
            color: black;
            background-color: white;
            border-radius: 5px;
        }
        img {
            background-color: white;
            margin: 0;
            padding: 0;
            border-radius: 6px;
        }
    '''
    html_content += '</style></head><body>'
    for picto_id, lemma in ids:
        if picto_id != 0:  # ignore invalid IDs
            img_url = f"https://static.arasaac.org/pictograms/{picto_id}/{picto_id}_500.png"
            html_content += f'''
            <figure>
                <img src="{img_url}" alt="{lemma}" width="200" height="200"/>
                <figcaption>{lemma}</figcaption>
            </figure>
            '''
    html_content += '</body></html>'
    return html_content

def generate_pdf(ids):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    for picto_id, lemma in ids:
        if picto_id != 0:  # ignore invalid IDs
            img_url = f"https://static.arasaac.org/pictograms/{picto_id}/{picto_id}_500.png"
            pdf.image(img_url, x=None, y=None, w=50, h=50)
            pdf.ln(55)
            pdf.set_font("Arial", size=12)
            pdf.cell(200, 10, txt=lemma, ln=True, align='C')
    pdf_path = "pictograms.pdf"
    pdf.output(pdf_path)
    return pdf_path
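
# Note: passing a URL straight to pdf.image() assumes the fpdf2 package
# (installed as "fpdf2", imported as "fpdf"), which can fetch remote images;
# the legacy PyFPDF library expects a local file. A download-first sketch
# (an assumption, not part of the original app):
#
#   import io, urllib.request
#   png = io.BytesIO(urllib.request.urlopen(img_url).read())
#   pdf.image(png, w=50, h=50)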

st.title("Traduction d'une phrase en pictogrammes ARASAAC")
sentence = st.text_input("Entrez une phrase en français:")

if sentence:
    inputs = tokenizer(sentence, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
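
    # Sampling (do_sample=True) makes each rerun produce a slightly different
    # pictogram sequence. A deterministic alternative (an assumption, not the
    # checkpoint's documented decoding setup) would be:
    #   outputs = model.generate(inputs, max_new_tokens=40, num_beams=4)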
    sentence_to_map = process_output_trad(pred)
    pictogram_ids = [get_id_picto_from_predicted_lemma(lexicon, lemma) for lemma in sentence_to_map]
    html = generate_html(pictogram_ids)
    components.html(html, height=800, scrolling=True)

    # Container to hold the download button
    download_container = st.container()
    with download_container:
        pdf_path = generate_pdf(pictogram_ids)
        with open(pdf_path, "rb") as pdf_file:
            st.download_button(label="Télécharger la traduction en PDF", data=pdf_file, file_name="pictograms.pdf", mime="application/pdf")
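
    # Alternative sketch (assumes fpdf2, where pdf.output() with no path
    # returns a bytearray): have generate_pdf return bytes(pdf.output()) and
    # pass them straight to st.download_button, avoiding the temp file.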