Spaces:
Sleeping
Sleeping
import functools
import html

import gradio as gr
import sentencepiece as spm
# Sample inputs offered in the UI. They mix English and Spanish sentences,
# some finance-flavoured and some medical, so the two tokenizers can be compared.
examples = [
    "Hello, world!",
    "European Central bank has announced cuts.",
    "This document is a summary of the European Public Assessment Report (EPAR).",
    "En el presente documento se resume el Informe Público Europeo de Evaluación (EPAR).",
    "Solution for injection",
    "How is Abilify used?",
    "¿Para qué se utiliza Abilify?",
    "Tratado de la Unión Europea y Tratado de Funcionamiento de la Unión Europea",
]
# Markup inserted between consecutive tokens so piece boundaries are visible.
_PIECE_SEPARATOR = "<span style='background-color: yellow;'> • </span>"


@functools.lru_cache(maxsize=None)
def _load_processor(model_file):
    """Load the SentencePiece model at *model_file*, at most once per process.

    The loaded processor is cached so that requests do not re-read the model
    file from disk every time. A failed load is not cached, so the next call
    retries.
    """
    processor = spm.SentencePieceProcessor()
    processor.load(model_file)
    return processor


def _render_pieces(processor, sentence):
    """Tokenize *sentence* with *processor* and return the pieces as HTML text.

    Each piece is HTML-escaped before joining, because pieces are derived from
    user input and would otherwise be interpreted as markup.
    """
    return _PIECE_SEPARATOR.join(
        html.escape(piece) for piece in processor.encode_as_pieces(sentence)
    )


def greet(sentence):
    """Tokenize *sentence* with both models and return an HTML comparison.

    Args:
        sentence: The text to tokenize.

    Returns:
        An HTML fragment with one section per model ("ECB dataset" and
        "EMEA dataset"), each listing the sentence's pieces separated by a
        highlighted bullet.
    """
    ecb_pieces = _render_pieces(_load_processor("bpe-ECB.model"), sentence)
    emea_pieces = _render_pieces(_load_processor("bpe-EMEA.model"), sentence)
    return (
        "<div class='output'>"
        "<div><b>ECB dataset</b><br>"
        f"{ecb_pieces}"
        "</div>"
        "<div style='padding-top: 1em;'><b>EMEA dataset</b><br>"
        f"{emea_pieces}"
        "</div>"
        "</div>"
    )
# Introductory text passed to the interface as its description.
description = """
Demo for SentencePiece. The model is trained on ECB and EMEA datasets in order to see the differences in tokenization.
The ECB dataset contains financial news articles, while the EMEA dataset contains medical articles.
The texts included in the training are in English and Spanish, for this reason the tokenisation will work best for these languages.
You can try some other languages and see how the tokenisation works. However, make sure you use only Latin characters.
The model did not see any non-Latin characters during training, so the results for languages that do not use Latin characters will be unpredictable.
Both variants are trained with 5000 vocab size.
"""
# Build the web UI: one text input, rendered through greet() as HTML.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="html",
    examples=examples,
    title="SentencePiece",
    description=description,
    # "lazy" is presumably Gradio's on-first-use example caching -- confirm
    # against the installed Gradio version.
    cache_examples="lazy",
    concurrency_limit=30,
    # Enlarges everything inside the element with class "output", which is
    # the wrapper <div> that greet() emits.
    css=".output {font-size: 150%;}",
)

# Start the server; share=True asks Gradio for a public share link.
demo.launch(share=True)