gliner-pii

Running

App Files Files Community

gliner-pii / app.py

MikeG27

Update app.py

ee1da13 verified about 1 month ago

raw

history blame

6.85 kB

	from typing import Dict, Union
	from gliner import GLiNER
	import gradio as gr
	import os

	# Load the PII-detection GLiNER checkpoint once, at import time, from its
	# ONNX export ("model.onnx") together with its tokenizer. The Hugging Face
	# access token is read from the HUGGINGFACE_TOKEN environment variable;
	# os.getenv yields None when the variable is not set.
	model = GLiNER.from_pretrained(
	    "gravitee-io/gliner-pii-detection",
	    load_onnx_model=True,
	    onnx_model_file="model.onnx",
	    load_tokenizer=True,
	    token=os.getenv("HUGGINGFACE_TOKEN"),
	)

	# Sample inputs offered in the demo UI. Each row is
	#   [input text, comma-separated label names, threshold, nested-NER flag]
	# in the same order as the input components handed to gr.Examples below.
	# NOTE(review): the "[email protected]" fragments look like placeholders
	# left by e-mail obfuscation rather than real sample addresses -- confirm
	# against the original file before relying on the "email" label examples.
	examples = [
	[
	"Jana Kowalczyk's driver license number is PL-DL-55443322 and she resides at 78 Ulica Nowowiejska, Wrocław. Her contact email is [email protected].",
	"name, driver_license_number, street_address, email",
	0.5,
	False,
	],
	[
	"Nguyen Van Long from Hanoi logs in from the IP 10.0.0.5 and uses the API key: 12ab34cd56ef78gh90ij. His company is VietNet Global.",
	"name, street_address, ipv4, api_key, company",
	0.5,
	False,
	],
	[
	"Sarah Johnson made a transaction using the credit card 379354508162306 and the CVV 834. Her IBAN is GB29 NWBK 6016 1331 9268 19.",
	"name, credit_card_number, credit_card_security_code, iban",
	0.5,
	False,
	],
	[
	"Employee Thomas Becker has the ID DE-EMP-44991 and joined DataFlux GmbH on 2021-12-01. His internal email is [email protected].",
	"name, employee_id, company, date, email",
	0.5,
	False,
	],
	[
	"Laura Rossi lives at Via Roma 101, Milano. Her social security number is IT-9988776655 and she was born on 1982-07-14.",
	"name, street_address, ssn, date_of_birth",
	0.5,
	False,
	],
	[
	"Omar El-Zein uses the SWIFT code BOFAUS3N and his bank routing number is 026009593. He lives near 12 Al-Azhar Street, Cairo.",
	"name, swift_bic_code, bank_routing_number, street_address",
	0.5,
	False,
	],
	[
	"Chen Wei's employee badge shows ID EMP-CN-8899. He signed the contract on 2023-03-20 at 10:15 AM using the password Dragon@123.",
	"name, employee_id, date, time, password",
	0.5,
	False,
	],
	[
	"Fatoumata Diarra, born 1994-04-04, lives at 45 Avenue de la Liberté, Bamako. Her BBAN is ML2930012345678901234567890.",
	"name, date_of_birth, street_address, bban",
	0.5,
	False,
	],
	[
	"Daniel Evans has a passport number K01234567 and a permanent address at 500 Pine Street, Seattle. His contact number is +1-206-555-0199.",
	"name, passport_number, street_address, phone_number",
	0.5,
	False,
	],
	[
	"Alejandro Torres created his customer account on 2024-08-30 using email [email protected] and ID CUST-MX-1122.",
	"name, date, email, customer_id",
	0.5,
	False,
	],
	]

	def ner(
	    text, labels: str, threshold: float, nested_ner: bool
	) -> Dict[str, Union[str, list]]:
	    """Run entity prediction on ``text`` and format it for the UI output.

	    Args:
	        text: The input text to scan.
	        labels: Comma-separated entity label names. Whitespace around each
	            name is ignored and empty names are dropped, so
	            ``"name, email"`` and ``"name,email"`` are equivalent.
	        threshold: Score threshold forwarded to ``model.predict_entities``.
	        nested_ner: When true, nested entities are allowed (the model is
	            called with ``flat_ner=False``).

	    Returns:
	        A dict with the original ``"text"`` and an ``"entities"`` list; each
	        entity is a dict with ``entity``, ``word``, ``start``, ``end`` and
	        ``score`` keys. The list is empty when the text is blank or no
	        usable label was supplied.
	    """
	    # Splitting on "," alone would keep the space after each comma
	    # (" email"), so the model would be queried with labels that differ
	    # from what the user typed.
	    label_names = [name.strip() for name in labels.split(",")]
	    label_names = [name for name in label_names if name]

	    # Nothing to predict: skip the model call instead of passing it a
	    # blank text or an empty label set.
	    if not label_names or not text or not text.strip():
	        return {"text": text, "entities": []}

	    predictions = model.predict_entities(
	        text, label_names, flat_ner=not nested_ner, threshold=threshold
	    )
	    return {
	        "text": text,
	        "entities": [
	            {
	                "entity": prediction["label"],
	                "word": prediction["text"],
	                "start": prediction["start"],
	                "end": prediction["end"],
	                # NOTE(review): the score is a constant 0, not the model's
	                # confidence -- presumably intentional for the highlighted
	                # display; confirm before changing.
	                "score": 0,
	            }
	            for prediction in predictions
	        ],
	    }


	# Demo UI: an intro, a collapsible "run locally" guide, the input controls
	# (text, labels, threshold, nested-NER toggle), the highlighted output and
	# the clickable examples. Every control re-runs `ner` when it is used.
	with gr.Blocks(title="GLiNER-M-v2.1") as demo:
	    gr.Markdown(
	        """
	# Gravitee PII (Personnally Identifiable Information extraction)

	GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
	"""
	    )
	    with gr.Accordion("How to run this model locally", open=False):
	        gr.Markdown(
	            """
	## Installation
	To use this model, you must install the GLiNER Python library:
	```
	!pip install gliner
	```

	## Usage
	Once you've downloaded the GLiNER library, you can import the GLiNER class. You can then load this model using `GLiNER.from_pretrained` and predict entities with `predict_entities`.
	"""
	        )
	        gr.Code(
	            """
	model = GLiNER.from_pretrained(
	"gravitee-io/gliner-pii-detection",
	load_onnx_model=True,
	load_tokenizer=True, onnx_model_file="model.onnx"
	)

	text = '''
	Hey, just a quick update. I talked to David yesterday.
	He sent over the files from his private email ([email protected]), and we should be careful with his SSN: 123-45-6789.
	Also, please don't push the GitHub repo until we remove the API key: ghp_abcdEfgh1234567890.
	He mentioned his new address is 123 Maple Street in New York.
	His PC adress is 192.168.1.100.
	'''

	labels = ["name",
	"email",
	"ssn",
	"api_key",
	"street_address",
	"date",
	"ipv4"]

	entities = model.predict_entities(text, labels)

	for entity in entities:
	print(entity["text"], "=>", entity["label"], "=>", entity["score"])
	"""
	        )

	    # The first example row pre-fills the text and label boxes.
	    input_text = gr.Textbox(
	        value=examples[0][0], label="Text input", placeholder="Enter your text here"
	    )
	    with gr.Row() as row:
	        labels = gr.Textbox(
	            value=examples[0][1],
	            label="Labels",
	            placeholder="Enter your labels here (comma separated)",
	            scale=2,
	        )
	        threshold = gr.Slider(
	            0,
	            1,
	            value=0.3,
	            step=0.01,
	            label="Threshold",
	            info="Lower the threshold to increase how many entities get predicted.",
	            scale=1,
	        )
	        nested_ner = gr.Checkbox(
	            # Index 3 is the nested-NER flag of an example row; index 2 is
	            # the threshold (0.5), which is truthy and would start the box
	            # checked even though every example uses False.
	            value=examples[0][3],
	            label="Nested NER",
	            info="Allow for nested NER?",
	            scale=0,
	        )
	    output = gr.HighlightedText(label="Predicted Entities")
	    submit_btn = gr.Button("Submit")
	    # Rebinds `examples` from the raw rows to the gr.Examples component;
	    # the rows are consumed by this call and not read again afterwards.
	    examples = gr.Examples(
	        examples,
	        fn=ner,
	        inputs=[input_text, labels, threshold, nested_ner],
	        outputs=output,
	        cache_examples=True,
	    )

	    # Submitting: every trigger runs the same prediction with the same
	    # inputs and output, registered in the original order.
	    for register in (
	        input_text.submit,
	        labels.submit,
	        threshold.release,
	        submit_btn.click,
	        nested_ner.change,
	    ):
	        register(
	            fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
	        )

	# Turn on request queuing, then start the app with debug=True.
	demo.queue().launch(debug=True)