Spaces:

arthrod
/

tucano-voraz-old

Running on Zero

App Files Files Community

arthrod committed on Apr 6

Commit

83f47e1

verified ·

1 Parent(s): 5c2478b

Update app.py

Browse files

Files changed (1) hide show

app.py +174 -53

app.py CHANGED Viewed

@@ -1,64 +1,185 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
         messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
         ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import torch
+import spaces
+import time
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# Model/tokenizer setup; the loader is invoked once, at import time (see the call below)
+# NOTE(review): the loader itself is wrapped in @spaces.GPU and called at module
+# level — confirm this matches the ZeroGPU guidance for where models are loaded.
+@spaces.GPU
+def load_model():
+    """Load the checkpoint and its tokenizer from the Hugging Face Hub.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple. The tokenizer is configured for left
+        padding; the model is requested in float16 with ``device_map="auto"``.
+    """
+    checkpoint = "arthrod/tucano_voraz_cwb-com-prompts-apr-04"
+    # Tokenizer first, then the weights (same order as before)
+    tok = AutoTokenizer.from_pretrained(checkpoint)
+    tok.padding_side = 'left'
+    lm = AutoModelForCausalLM.from_pretrained(
+        checkpoint, torch_dtype=torch.float16, device_map="auto"
+    )
+    return lm, tok
+# Module-level handles read by predict()
+model, tokenizer = load_model()
+# Chat callback: run the model on one user message and stream the reply back
+@spaces.GPU
+def predict(message, history):
+    """Generate the model's reply to ``message`` and yield it incrementally.
+
+    Args:
+        message: Text of the latest user turn.
+        history: Earlier conversation passed in by the chat UI. It is never
+            read here, so each request is answered from ``message`` alone.
+
+    Yields:
+        Growing prefixes of the decoded reply (one extra character per step,
+        covering at most the first 100 characters), followed by the complete
+        reply.
+    """
+    # Wrap the incoming text as a single-turn conversation and render the prompt string
+    messages = [{"role": "user", "content": message}]
+    prompt = tokenizer.apply_chat_template(
         messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    # Encode the rendered prompt and move the tensors to the model's device
+    # NOTE(review): the prompt is rendered with tokenize=False and then tokenized
+    # with default settings — confirm special tokens are not being added twice.
+    encoded = tokenizer(prompt, padding=True, return_tensors="pt").to(model.device)
+    prompt_len = encoded.input_ids.shape[1]
+    # Sampling is disabled; the sampling parameters are passed explicitly as None
+    generation_kwargs = dict(
+        max_new_tokens=512,
+        do_sample=False,
+        temperature=None,
+        top_p=None,
+        top_k=None,
+    )
+    with torch.no_grad():
+        sequences = model.generate(**encoded, **generation_kwargs)
+    # Drop the prompt tokens so only the newly generated tokens are decoded
+    output = tokenizer.decode(sequences[0, prompt_len:], skip_special_tokens=True)
+    # Simulated streaming: reveal the first characters one at a time (capped at 100)
+    preview_len = min(len(output), 100)
+    for end in range(1, preview_len + 1):
+        time.sleep(0.02)  # short pause between updates
+        yield output[:end]
+    # Always finish with the full reply, including anything past the preview cap
+    yield output
+# Sample prompts handed to the chat UI (see `examples=examples` in gr.ChatInterface).
+# Each is a Portuguese sentence containing several kinds of personal data: names,
+# street addresses, CPF/RG numbers, phone numbers, a birth date, a company name
+# and credit-card details.
+examples = [
+    "Meu nome é Ricardo Almeida e moro na Rua das Flores, 123, em Curitiba. Meu CPF é 123.456.789-00 e meu telefone é (41) 98765-4321.",
+    "A paciente Maria da Silva, nascida em 15/03/1980, apresentou resultados alterados no exame de sangue. Favor contatar pelo celular 11 99876-5432.",
+    "O funcionário José Roberto Santos, portador do RG 12.345.678-9, está autorizado a acessar o prédio da empresa Tecnologia Brasil LTDA.",
+    "Declaração de Imposto de Renda do contribuinte Roberto Carlos Magalhães, CPF 987.654.321-00, residente na Av. Paulista, 1578, São Paulo-SP.",
+    "Segue o número do cartão de crédito para o pagamento: 5432-1098-7654-3210, titular Ana Beatriz Oliveira, validade 12/25, código 123."
+]
+# Reference table of PII categories, rendered as cards inside the UI accordion.
+# Each entry has three string fields:
+#   "name"    - display label (Portuguese)
+#   "tag"     - tag name(s) shown for that category; several tags are separated by "/"
+#   "example" - a "raw value → [TAG]" illustration of the replacement
+# NOTE(review): presumably these tags mirror what the model emits — verify against the model.
+pii_types = [
+    {"name": "CPF/CNPJ", "tag": "SSN_CPF", "example": "123.456.789-00 → [SSN_CPF]"},
+    {"name": "RG", "tag": "ID_RG", "example": "12.345.678-9 → [ID_RG]"},
+    {"name": "Nome", "tag": "FIRST_NAME/MIDDLE_NAME/LAST_NAME", "example": "João Silva → [FIRST_NAME] [LAST_NAME]"},
+    {"name": "Endereço", "tag": "STREET_NAME/BUILDING_NB", "example": "Rua Aurora, 123 → [STREET_NAME], [BUILDING_NB]"},
+    {"name": "Bairro", "tag": "NEIGHBORHOOD", "example": "Jardim Paulista → [NEIGHBORHOOD]"},
+    {"name": "Cidade", "tag": "CITY", "example": "São Paulo → [CITY]"},
+    {"name": "Estado", "tag": "STATE/STATE_ABBR", "example": "São Paulo/SP → [STATE]/[STATE_ABBR]"},
+    {"name": "CEP", "tag": "ZIPCODE_CEP", "example": "01234-567 → [ZIPCODE_CEP]"},
+    {"name": "Telefone", "tag": "PHONE", "example": "(11) 98765-4321 → [PHONE]"},
+    {"name": "Data de nascimento", "tag": "BIRTHDATE", "example": "15/03/1980 → [BIRTHDATE]"},
+    {"name": "Cartão de crédito", "tag": "CREDITCARD", "example": "5432-1098-7654-3210 → [CREDITCARD]"},
+    {"name": "PIS/PASEP", "tag": "SOCIAL_NB_PIS", "example": "123.45678.90-1 → [SOCIAL_NB_PIS]"},
+    {"name": "Dados médicos", "tag": "MEDICAL_DATA", "example": "Diagnóstico de hipertensão → [MEDICAL_DATA]"},
+    {"name": "Opinião política", "tag": "POLITICAL_OPINION", "example": "Apoiador do partido X → [POLITICAL_OPINION]"},
+    {"name": "Convicção religiosa", "tag": "RELIGIOUS_CONVICTION", "example": "Praticante de religião Y → [RELIGIOUS_CONVICTION]"}
+]
+# Build the Gradio UI as a Blocks app bound to `demo`: page header, project
+# description with a collapsible PII reference, the chat interface wired to
+# predict(), and a footer. The `css` string defines the custom classes used by
+# the HTML fragments below.
+with gr.Blocks(css="""
+    .gradio-container {max-width: 1200px !important; margin-left: auto !important; margin-right: auto !important;}
+    .examples {margin-top: 10px !important;}
+    .pii-table {margin-top: 20px !important; margin-bottom: 20px !important;}
+    .footer {text-align: center; margin-top: 20px !important; padding: 10px !important; border-top: 1px solid #eee !important;}
+    .pii-card {background-color: #f9f9f9; border-radius: 8px; padding: 15px; margin-bottom: 10px;}
+    .header-image {display: block; margin: 0 auto; max-width: 120px; margin-bottom: 10px;}
+    .assistant-message {border-left: 3px solid #ffd700 !important; background-color: #fffdf7 !important;}
+    .user-message {background-color: #f5f5f5 !important; border-left: 3px solid #6c757d !important;}
+""") as demo:
+    # Page header: logo image, title, subtitle and author contact link
+    # NOTE(review): the mailto target in this fragment reads "[email protected]",
+    # which looks like an obfuscation placeholder — confirm the intended address.
+    gr.HTML("""
+    <div style="text-align: center; margin-bottom: 20px;">
+        <img src="https://i.imgur.com/UGwNbsT.png" alt="Tucano Voraz Logo" class="header-image">
+        <h1 style="margin-top: 5px; margin-bottom: 10px;">Tucano Voraz</h1>
+        <h3 style="margin-top: 0; color: #666;">Sistema de Anonimização e Remoção de Dados Sensíveis</h3>
+        <p>by <a href="mailto:[email protected]">Arthur Souza Rodrigues</a></p>
+    </div>
+    """)
+    # Description column: project overview (Markdown) plus the PII reference accordion
+    with gr.Row():
+        with gr.Column(scale=2):
+            gr.Markdown("""
+            ## Sobre o Projeto
+            **Tucano Voraz** é uma ferramenta de anonimização de texto projetada para identificar e remover
+            informações pessoais sensíveis (PII) de documentos em português. Em um mundo onde vazamentos
+            de dados são cada vez mais comuns, proteger informações pessoais tornou-se essencial.
+            ### Como Funciona:
+            1. Cole seu texto contendo dados sensíveis
+            2. O modelo identificará automaticamente PIIs
+            3. Receberá o texto com os dados sensíveis substituídos por tags
+            ### Aplicações:
+            - Conformidade com LGPD e regulamentos de privacidade
+            - Compartilhamento seguro de documentos
+            - Preparação de dados para processamento por terceiros
+            - Publicação de documentos em ambientes públicos
+            """)
+            # Collapsed-by-default accordion listing every entry of `pii_types`
+            with gr.Accordion("📋 Tipos de Dados Sensíveis Detectados", open=False):
+                # Assemble one HTML grid: an opening wrapper, one card per PII type,
+                # then the closing tags
+                pii_html = "<div class='pii-table'><div style='display: grid; grid-template-columns: repeat(auto-fill, minmax(350px, 1fr)); gap: 10px;'>"
+                for pii in pii_types:
+                    # Card shows the entry's name, tag and example fields
+                    pii_html += f"""
+                    <div class='pii-card'>
+                        <h4 style='margin-top: 0; margin-bottom: 5px;'>{pii['name']}</h4>
+                        <div style='color: #666; font-size: 0.9em; margin-bottom: 5px;'>Tag: <code>{pii['tag']}</code></div>
+                        <div style='font-size: 0.85em;'>Exemplo: <code>{pii['example']}</code></div>
+                    </div>
+                    """
+                pii_html += "</div></div>"
+                gr.HTML(pii_html)
+    # Chat area: predict() handles each message; `examples` supplies the sample prompts
+    chat_interface = gr.ChatInterface(
+        fn=predict,
+        type="messages",
+        examples=examples,
+        title="",
+        # Conversation pane: bubble layout with share/copy buttons; only the second
+        # avatar slot is given an image
+        chatbot=gr.Chatbot(
+            height=500,
+            bubble_full_width=False,
+            show_share_button=True,
+            show_copy_button=True,
+            layout="bubble",
+            avatar_images=(None, "https://i.imgur.com/Ptd2AoL.png")
         ),
+        # Multi-line input box (3 lines initially, up to 10) with a Portuguese placeholder
+        textbox=gr.Textbox(
+            placeholder="Insira seu texto para anonimização...",
+            lines=3,
+            max_lines=10,
+            show_copy_button=True,
+            container=False,
+            scale=7
+        )
+    )
+    # Footer: data-handling statement and project links
+    gr.HTML("""
+    <div class="footer">
+        <p>🔒 Todos os dados são processados localmente - não armazenamos textos ou dados sensíveis</p>
+        <p>Siga o desenvolvimento: <a href="https://www.linkedin.com/in/arthrod/detail/recent-activity/" target="_blank">LinkedIn</a> | Desenvolvido com ❤️ usando Gradio e HuggingFace</p>
+    </div>
+    """)
+# Start the Gradio app only when this file is executed as a script (not on import)
 if __name__ == "__main__":
+    demo.launch()