Spaces:

arthrod
/

tucano-voraz-old

Running on Zero

App Files Files Community

tucano-voraz-old / app.py

arthrod

Update app.py

83f47e1 verified 5 months ago

raw

history blame

8.57 kB

	import gradio as gr
	import torch
	import spaces
	import time
	from transformers import AutoTokenizer, AutoModelForCausalLM

	# Load model and tokenizer
	@spaces.GPU
	def load_model():
	    """Load the anonymization checkpoint and its tokenizer.

	    Returns:
	        A ``(model, tokenizer)`` tuple. The model is loaded in float16 with
	        ``device_map="auto"``; the tokenizer is switched to left padding.
	    """
	    checkpoint = "arthrod/tucano_voraz_cwb-com-prompts-apr-04"

	    tok = AutoTokenizer.from_pretrained(checkpoint)
	    # Left padding keeps the prompt flush against the generated tokens,
	    # which is what the slicing in predict() relies on.
	    tok.padding_side = 'left'

	    load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
	    net = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)

	    return net, tok

	# Loaded once at import time; predict() reads these module-level objects.
	model, tokenizer = load_model()

	# Main prediction function: single-turn generation using the tokenizer's chat template
	@spaces.GPU
	def predict(message, history):
	    """Stream the model's reply to ``message`` for the chat UI.

	    The prompt is built from ``message`` alone using the tokenizer's chat
	    template. ``history`` is accepted because ``gr.ChatInterface`` passes
	    it, but earlier turns are not sent to the model.

	    Decoding is greedy (``do_sample=False``) with at most 512 new tokens.
	    Generation runs in a worker thread and the text is yielded as it is
	    produced, instead of replaying a finished answer with artificial
	    delays while the GPU sits idle.

	    Args:
	        message: Text typed by the user.
	        history: Previous chat turns supplied by Gradio (unused).

	    Yields:
	        The reply decoded so far; the last value is the complete reply.

	    Raises:
	        Exception: Any error raised inside ``model.generate`` is re-raised
	            here once the stream has been drained.
	    """
	    # Function-scope imports keep this block self-contained.
	    from threading import Thread

	    from transformers import TextIteratorStreamer

	    # Apply chat template
	    prompt = tokenizer.apply_chat_template(
	        [{"role": "user", "content": message}],
	        tokenize=False,
	        add_generation_prompt=True,
	    )

	    # NOTE(review): the rendered template is tokenized with the default
	    # add_special_tokens setting; if the template already emits BOS this
	    # duplicates it. Confirm against the training format before changing.
	    model_inputs = tokenizer(prompt, padding=True, return_tensors="pt").to(model.device)

	    # skip_prompt drops the echoed input so only newly generated text is
	    # streamed, matching the original "generated part only" slicing.
	    streamer = TextIteratorStreamer(
	        tokenizer,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )
	    failures = []

	    def _generate():
	        """Run generation in the worker; report failures to the consumer."""
	        try:
	            # Grad mode is per-thread, so it is disabled here rather than
	            # in the calling thread.
	            with torch.no_grad():
	                model.generate(
	                    **model_inputs,
	                    streamer=streamer,
	                    max_new_tokens=512,
	                    do_sample=False,  # deterministic output
	                    temperature=None,  # sampling knobs cleared for greedy decoding
	                    top_p=None,
	                    top_k=None,
	                )
	        except Exception as exc:  # re-raised in the consuming thread below
	            failures.append(exc)
	            # Unblock the consumer, which would otherwise wait forever.
	            streamer.end()

	    worker = Thread(target=_generate, daemon=True)
	    worker.start()

	    reply = ""
	    for chunk in streamer:
	        if chunk:
	            reply += chunk
	            yield reply

	    worker.join()
	    if failures:
	        raise failures[0]

	    # Always finish with the complete message (also covers an empty reply).
	    yield reply

	# Example texts demonstrating different PII types
	examples = [
	    # Name, street address, CPF and phone number
	    "Meu nome é Ricardo Almeida e moro na Rua das Flores, 123, em Curitiba. Meu CPF é 123.456.789-00 e meu telefone é (41) 98765-4321.",
	    # Patient name, birth date, medical context and mobile number
	    "A paciente Maria da Silva, nascida em 15/03/1980, apresentou resultados alterados no exame de sangue. Favor contatar pelo celular 11 99876-5432.",
	    # Employee name and RG
	    "O funcionário José Roberto Santos, portador do RG 12.345.678-9, está autorizado a acessar o prédio da empresa Tecnologia Brasil LTDA.",
	    # Taxpayer name, CPF and address
	    "Declaração de Imposto de Renda do contribuinte Roberto Carlos Magalhães, CPF 987.654.321-00, residente na Av. Paulista, 1578, São Paulo-SP.",
	    # Credit card number, holder, expiry and security code
	    "Segue o número do cartão de crédito para o pagamento: 5432-1098-7654-3210, titular Ana Beatriz Oliveira, validade 12/25, código 123.",
	]

	# PII types with explanations
	# Each row is (display name, replacement tag(s), before → after example);
	# the comprehension expands rows into the dicts the UI cards read.
	pii_types = [
	    {"name": label, "tag": tag, "example": sample}
	    for label, tag, sample in (
	        ("CPF/CNPJ", "SSN_CPF", "123.456.789-00 → [SSN_CPF]"),
	        ("RG", "ID_RG", "12.345.678-9 → [ID_RG]"),
	        ("Nome", "FIRST_NAME/MIDDLE_NAME/LAST_NAME", "João Silva → [FIRST_NAME] [LAST_NAME]"),
	        ("Endereço", "STREET_NAME/BUILDING_NB", "Rua Aurora, 123 → [STREET_NAME], [BUILDING_NB]"),
	        ("Bairro", "NEIGHBORHOOD", "Jardim Paulista → [NEIGHBORHOOD]"),
	        ("Cidade", "CITY", "São Paulo → [CITY]"),
	        ("Estado", "STATE/STATE_ABBR", "São Paulo/SP → [STATE]/[STATE_ABBR]"),
	        ("CEP", "ZIPCODE_CEP", "01234-567 → [ZIPCODE_CEP]"),
	        ("Telefone", "PHONE", "(11) 98765-4321 → [PHONE]"),
	        ("Data de nascimento", "BIRTHDATE", "15/03/1980 → [BIRTHDATE]"),
	        ("Cartão de crédito", "CREDITCARD", "5432-1098-7654-3210 → [CREDITCARD]"),
	        ("PIS/PASEP", "SOCIAL_NB_PIS", "123.45678.90-1 → [SOCIAL_NB_PIS]"),
	        ("Dados médicos", "MEDICAL_DATA", "Diagnóstico de hipertensão → [MEDICAL_DATA]"),
	        ("Opinião política", "POLITICAL_OPINION", "Apoiador do partido X → [POLITICAL_OPINION]"),
	        ("Convicção religiosa", "RELIGIOUS_CONVICTION", "Praticante de religião Y → [RELIGIOUS_CONVICTION]"),
	    )
	]

	# Create Gradio Interface
	# Page layout: header, project description, collapsible PII reference,
	# chat interface and footer.
	# NOTE(review): the original indentation was lost, so the nesting of the
	# `with` blocks below is reconstructed — confirm against the intended layout.
	with gr.Blocks(css="""
	    .gradio-container {max-width: 1200px !important; margin-left: auto !important; margin-right: auto !important;}
	    .examples {margin-top: 10px !important;}
	    .pii-table {margin-top: 20px !important; margin-bottom: 20px !important;}
	    .footer {text-align: center; margin-top: 20px !important; padding: 10px !important; border-top: 1px solid #eee !important;}
	    .pii-card {background-color: #f9f9f9; border-radius: 8px; padding: 15px; margin-bottom: 10px;}
	    .header-image {display: block; margin: 0 auto; max-width: 120px; margin-bottom: 10px;}
	    .assistant-message {border-left: 3px solid #ffd700 !important; background-color: #fffdf7 !important;}
	    .user-message {background-color: #f5f5f5 !important; border-left: 3px solid #6c757d !important;}
	""") as demo:

	    # NOTE(review): the mailto target below looks like an obfuscated
	    # placeholder rather than a real address — confirm and restore it.
	    gr.HTML("""
	        <div style="text-align: center; margin-bottom: 20px;">
	        <img src="https://i.imgur.com/UGwNbsT.png" alt="Tucano Voraz Logo" class="header-image">
	        <h1 style="margin-top: 5px; margin-bottom: 10px;">Tucano Voraz</h1>
	        <h3 style="margin-top: 0; color: #666;">Sistema de Anonimização e Remoção de Dados Sensíveis</h3>
	        <p>by <a href="mailto:[email protected]">Arthur Souza Rodrigues</a></p>
	        </div>
	    """)

	    with gr.Row():
	        with gr.Column(scale=2):
	            gr.Markdown("""
	            ## Sobre o Projeto

	            Tucano Voraz é uma ferramenta de anonimização de texto projetada para identificar e remover
	            informações pessoais sensíveis (PII) de documentos em português. Em um mundo onde vazamentos
	            de dados são cada vez mais comuns, proteger informações pessoais tornou-se essencial.

	            ### Como Funciona:

	            1. Cole seu texto contendo dados sensíveis
	            2. O modelo identificará automaticamente PIIs
	            3. Receberá o texto com os dados sensíveis substituídos por tags

	            ### Aplicações:

	            - Conformidade com LGPD e regulamentos de privacidade
	            - Compartilhamento seguro de documentos
	            - Preparação de dados para processamento por terceiros
	            - Publicação de documentos em ambientes públicos
	            """)

	    with gr.Accordion("📋 Tipos de Dados Sensíveis Detectados", open=False):
	        # One card per entry in pii_types, joined in a single pass.
	        pii_cards = "".join(
	            f"""
	            <div class='pii-card'>
	            <h4 style='margin-top: 0; margin-bottom: 5px;'>{pii['name']}</h4>
	            <div style='color: #666; font-size: 0.9em; margin-bottom: 5px;'>Tag: <code>{pii['tag']}</code></div>
	            <div style='font-size: 0.85em;'>Exemplo: <code>{pii['example']}</code></div>
	            </div>
	            """
	            for pii in pii_types
	        )
	        gr.HTML(
	            "<div class='pii-table'><div style='display: grid; grid-template-columns: repeat(auto-fill, minmax(350px, 1fr)); gap: 10px;'>"
	            + pii_cards
	            + "</div></div>"
	        )

	    chat_interface = gr.ChatInterface(
	        fn=predict,
	        type="messages",
	        examples=examples,
	        title="",
	        chatbot=gr.Chatbot(
	            # Must agree with the ChatInterface's type; otherwise the
	            # Chatbot is created with its own default message format.
	            type="messages",
	            height=500,
	            bubble_full_width=False,
	            show_share_button=True,
	            show_copy_button=True,
	            layout="bubble",
	            avatar_images=(None, "https://i.imgur.com/Ptd2AoL.png"),
	        ),
	        textbox=gr.Textbox(
	            placeholder="Insira seu texto para anonimização...",
	            lines=3,
	            max_lines=10,
	            show_copy_button=True,
	            container=False,
	            scale=7,
	        ),
	    )

	    # The separator is a plain "|": the previous "\|" was an invalid escape
	    # sequence and rendered a stray backslash in the footer.
	    gr.HTML("""
	        <div class="footer">
	        <p>🔒 Todos os dados são processados localmente - não armazenamos textos ou dados sensíveis</p>
	        <p>Siga o desenvolvimento: <a href="https://www.linkedin.com/in/arthrod/detail/recent-activity/" target="_blank">LinkedIn</a> | Desenvolvido com ❤️ usando Gradio e HuggingFace</p>
	        </div>
	    """)

	# Launch the app
	if __name__ == "__main__":
	    # Start the Gradio server only when this file is executed as a script.
	    demo.launch()