Spaces:
Running
Running
from typing import Dict, Union | |
from gliner import GLiNER | |
import gradio as gr | |
import os | |
# Load the PII-detection checkpoint once at import time; every request reuses
# this single module-level instance.
model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    # Hub access token comes from the environment; this is None when the
    # HUGGINGFACE_TOKEN variable is not set.
    token=os.environ.get("HUGGINGFACE_TOKEN"),
    # Ask the loader for the ONNX export stored as "model.onnx", plus the tokenizer.
    load_onnx_model=True,
    onnx_model_file="model.onnx",
    load_tokenizer=True,
)
# Example rows offered in the UI. Each row is
# [input text, comma-separated labels, threshold, nested-NER flag],
# in the same order as the inputs of `ner`.
#
# NOTE: the e-mail addresses below use the reserved example.com domain. They
# replace "[email protected]" placeholders (web e-mail obfuscation residue) so
# that the rows requesting the "email" label actually contain an address.
examples = [
    [
        "Jana Kowalczyk's driver license number is PL-DL-55443322 and she resides at 78 Ulica Nowowiejska, Wrocław. Her contact email is jana.kowalczyk@example.com.",
        "name, driver_license_number, street_address, email",
        0.5,
        False,
    ],
    [
        "Nguyen Van Long from Hanoi logs in from the IP 10.0.0.5 and uses the API key: 12ab34cd56ef78gh90ij. His company is VietNet Global.",
        "name, street_address, ipv4, api_key, company",
        0.5,
        False,
    ],
    [
        "Sarah Johnson made a transaction using the credit card 379354508162306 and the CVV 834. Her IBAN is GB29 NWBK 6016 1331 9268 19.",
        "name, credit_card_number, credit_card_security_code, iban",
        0.5,
        False,
    ],
    [
        "Employee Thomas Becker has the ID DE-EMP-44991 and joined DataFlux GmbH on 2021-12-01. His internal email is thomas.becker@example.com.",
        "name, employee_id, company, date, email",
        0.5,
        False,
    ],
    [
        "Laura Rossi lives at Via Roma 101, Milano. Her social security number is IT-9988776655 and she was born on 1982-07-14.",
        "name, street_address, ssn, date_of_birth",
        0.5,
        False,
    ],
    [
        "Omar El-Zein uses the SWIFT code BOFAUS3N and his bank routing number is 026009593. He lives near 12 Al-Azhar Street, Cairo.",
        "name, swift_bic_code, bank_routing_number, street_address",
        0.5,
        False,
    ],
    [
        "Chen Wei's employee badge shows ID EMP-CN-8899. He signed the contract on 2023-03-20 at 10:15 AM using the password Dragon@123.",
        "name, employee_id, date, time, password",
        0.5,
        False,
    ],
    [
        "Fatoumata Diarra, born 1994-04-04, lives at 45 Avenue de la Liberté, Bamako. Her BBAN is ML2930012345678901234567890.",
        "name, date_of_birth, street_address, bban",
        0.5,
        False,
    ],
    [
        "Daniel Evans has a passport number K01234567 and a permanent address at 500 Pine Street, Seattle. His contact number is +1-206-555-0199.",
        "name, passport_number, street_address, phone_number",
        0.5,
        False,
    ],
    [
        "Alejandro Torres created his customer account on 2024-08-30 using email alejandro.torres@example.com and ID CUST-MX-1122.",
        "name, date, email, customer_id",
        0.5,
        False,
    ],
]
def _parse_labels(labels: str) -> list:
    """Split a comma-separated label string into trimmed, non-empty labels."""
    return [label.strip() for label in labels.split(",") if label.strip()]


def ner(
    text, labels: str, threshold: float, nested_ner: bool
) -> Dict[str, Union[str, list]]:
    """Run entity prediction on *text* and shape the result for the UI.

    Args:
        text: The input text to scan.
        labels: Comma-separated entity labels, e.g. ``"name, email"``.
            Whitespace around each label is ignored and empty entries
            are dropped.
        threshold: Score threshold forwarded to ``model.predict_entities``.
        nested_ner: When true, the model is called with ``flat_ner=False``.

    Returns:
        A dict with the original ``"text"`` and an ``"entities"`` list; each
        entity carries ``entity``, ``word``, ``start``, ``end`` and ``score``.
        The entity list is empty when the text is blank or no usable label
        was supplied (the model is not called in that case).
    """
    label_list = _parse_labels(labels)

    # Nothing to predict: skip the model call instead of feeding it empty input.
    if not text or not text.strip() or not label_list:
        return {"text": text or "", "entities": []}

    predictions = model.predict_entities(
        text, label_list, flat_ner=not nested_ner, threshold=threshold
    )
    return {
        "text": text,
        "entities": [
            {
                "entity": prediction["label"],
                "word": prediction["text"],
                "start": prediction["start"],
                "end": prediction["end"],
                # The score is deliberately left at the constant 0 used by the
                # original implementation, not the model's own score.
                "score": 0,
            }
            for prediction in predictions
        ],
    }
# Static text shown in the UI. Kept at module level so the string contents sit
# at column 0 and are passed to the components without extra leading spaces.
_INTRO_MARKDOWN = """
# Gravitee PII (Personnally Identifiable Information extraction)
GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
"""

_INSTALL_MARKDOWN = """
## Installation
To use this model, you must install the GLiNER Python library:
```
!pip install gliner
```
## Usage
Once you've downloaded the GLiNER library, you can import the GLiNER class. You can then load this model using `GLiNER.from_pretrained` and predict entities with `predict_entities`.
"""

# Usage snippet displayed in the accordion. It includes the GLiNER import so it
# can be copied and run as-is, and uses a reserved example.com address in place
# of an obfuscated e-mail placeholder.
_USAGE_CODE = """
from gliner import GLiNER

model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    load_onnx_model=True,
    load_tokenizer=True, onnx_model_file="model.onnx"
)
text = '''
Hey, just a quick update. I talked to David yesterday.
He sent over the files from his private email (david@example.com), and we should be careful with his SSN: 123-45-6789.
Also, please don't push the GitHub repo until we remove the API key: ghp_abcdEfgh1234567890.
He mentioned his new address is 123 Maple Street in New York.
His PC adress is 192.168.1.100.
'''
labels = ["name",
          "email",
          "ssn",
          "api_key",
          "street_address",
          "date",
          "ipv4"]
entities = model.predict_entities(text, labels)
for entity in entities:
    print(entity["text"], "=>", entity["label"], "=>", entity["score"])
"""

with gr.Blocks(title="GLiNER-M-v2.1") as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    with gr.Accordion("How to run this model locally", open=False):
        gr.Markdown(_INSTALL_MARKDOWN)
        gr.Code(_USAGE_CODE)

    # Input widgets, pre-filled from the first example row
    # ([text, labels, threshold, nested flag]).
    input_text = gr.Textbox(
        value=examples[0][0], label="Text input", placeholder="Enter your text here"
    )
    with gr.Row() as row:
        labels = gr.Textbox(
            value=examples[0][1],
            label="Labels",
            placeholder="Enter your labels here (comma separated)",
            scale=2,
        )
        threshold = gr.Slider(
            0,
            1,
            value=0.3,
            step=0.01,
            label="Threshold",
            info="Lower the threshold to increase how many entities get predicted.",
            scale=1,
        )
        nested_ner = gr.Checkbox(
            # Index 3 is the boolean nested-NER flag; index 2 is the threshold
            # (0.5), which is truthy and would pre-check the box.
            value=examples[0][3],
            label="Nested NER",
            info="Allow for nested NER?",
            scale=0,
        )
    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")

    # All triggers below call `ner` with the same inputs and output.
    ner_inputs = [input_text, labels, threshold, nested_ner]

    # NOTE: this rebinds the module-level name `examples` from the list of
    # rows to the gr.Examples component, as the original code did.
    examples = gr.Examples(
        examples,
        fn=ner,
        inputs=ner_inputs,
        outputs=output,
        cache_examples=True,
    )

    # Submitting: re-run the prediction on Enter in either textbox, on slider
    # release, on button click, and when the checkbox changes.
    for register in (
        input_text.submit,
        labels.submit,
        threshold.release,
        submit_btn.click,
        nested_ner.change,
    ):
        register(fn=ner, inputs=ner_inputs, outputs=output)

demo.queue()
demo.launch(debug=True)