gliner-pii / app.py
MikeG27's picture
Update app.py
ee1da13 verified
raw
history blame
6.85 kB
from typing import Dict, Union
from gliner import GLiNER
import gradio as gr
import os
# Load the PII-detection checkpoint once, at import time, so every request
# handled by `ner` reuses the same model instance.
# The access token comes from the HUGGINGFACE_TOKEN environment variable and
# is None when that variable is unset.
model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    token=os.environ.get("HUGGINGFACE_TOKEN"),
    load_onnx_model=True,
    load_tokenizer=True,
    onnx_model_file="model.onnx",
)
# Demo inputs shown in the UI. Each row is
# [text, comma-separated labels, threshold, nested_ner]; every example uses
# the same threshold (0.5) and has nested NER switched off, so only the text
# and the label list are spelled out per example.
examples = [
    [text, label_csv, 0.5, False]
    for text, label_csv in (
        (
            "Jana Kowalczyk's driver license number is PL-DL-55443322 and she resides at 78 Ulica Nowowiejska, Wrocław. Her contact email is [email protected].",
            "name, driver_license_number, street_address, email",
        ),
        (
            "Nguyen Van Long from Hanoi logs in from the IP 10.0.0.5 and uses the API key: 12ab34cd56ef78gh90ij. His company is VietNet Global.",
            "name, street_address, ipv4, api_key, company",
        ),
        (
            "Sarah Johnson made a transaction using the credit card 379354508162306 and the CVV 834. Her IBAN is GB29 NWBK 6016 1331 9268 19.",
            "name, credit_card_number, credit_card_security_code, iban",
        ),
        (
            "Employee Thomas Becker has the ID DE-EMP-44991 and joined DataFlux GmbH on 2021-12-01. His internal email is [email protected].",
            "name, employee_id, company, date, email",
        ),
        (
            "Laura Rossi lives at Via Roma 101, Milano. Her social security number is IT-9988776655 and she was born on 1982-07-14.",
            "name, street_address, ssn, date_of_birth",
        ),
        (
            "Omar El-Zein uses the SWIFT code BOFAUS3N and his bank routing number is 026009593. He lives near 12 Al-Azhar Street, Cairo.",
            "name, swift_bic_code, bank_routing_number, street_address",
        ),
        (
            "Chen Wei's employee badge shows ID EMP-CN-8899. He signed the contract on 2023-03-20 at 10:15 AM using the password Dragon@123.",
            "name, employee_id, date, time, password",
        ),
        (
            "Fatoumata Diarra, born 1994-04-04, lives at 45 Avenue de la Liberté, Bamako. Her BBAN is ML2930012345678901234567890.",
            "name, date_of_birth, street_address, bban",
        ),
        (
            "Daniel Evans has a passport number K01234567 and a permanent address at 500 Pine Street, Seattle. His contact number is +1-206-555-0199.",
            "name, passport_number, street_address, phone_number",
        ),
        (
            "Alejandro Torres created his customer account on 2024-08-30 using email [email protected] and ID CUST-MX-1122.",
            "name, date, email, customer_id",
        ),
    )
]
def ner(
    text, labels: str, threshold: float, nested_ner: bool
) -> Dict[str, Union[str, list]]:
    """Detect entities in *text* and format them for ``gr.HighlightedText``.

    Args:
        text: Raw text to scan.
        labels: Comma-separated entity labels, e.g. ``"name, email"``.
            Whitespace around each label is stripped and empty items
            (such as the one produced by a trailing comma) are dropped.
        threshold: Forwarded to ``model.predict_entities`` as ``threshold``.
        nested_ner: When true the model is called with ``flat_ner=False``;
            otherwise with ``flat_ner=True``.

    Returns:
        A dict with the input under ``"text"`` and, under ``"entities"``, one
        dict per prediction holding ``entity`` (the label), ``word`` (the
        matched text), ``start``/``end`` offsets and a ``score`` that is
        always reported as 0. When the text is blank or no usable label was
        given, the model is not called and ``"entities"`` is empty.
    """
    # "name, email" must query the model with "email", not " email".
    label_list = [label.strip() for label in (labels or "").split(",")]
    label_list = [label for label in label_list if label]

    # Nothing to search, or nothing to search for: skip inference entirely.
    if not text or not text.strip() or not label_list:
        return {"text": text or "", "entities": []}

    predictions = model.predict_entities(
        text, label_list, flat_ner=not nested_ner, threshold=threshold
    )
    return {
        "text": text,
        "entities": [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "score": 0,
            }
            for entity in predictions
        ],
    }
# Build the demo UI: an intro, a collapsible "run locally" guide, the input
# controls (text, labels, threshold, nested-NER toggle), the highlighted
# output, the clickable examples, and the event wiring that re-runs `ner`.
with gr.Blocks(title="GLiNER-M-v2.1") as demo:
    gr.Markdown(
        """
# Gravitee PII (Personnally Identifiable Information extraction)
GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
"""
    )
    with gr.Accordion("How to run this model locally", open=False):
        gr.Markdown(
            """
## Installation
To use this model, you must install the GLiNER Python library:
```
!pip install gliner
```
## Usage
Once you've downloaded the GLiNER library, you can import the GLiNER class. You can then load this model using `GLiNER.from_pretrained` and predict entities with `predict_entities`.
"""
        )
        # The snippet is shown to users verbatim, so the loop body must be
        # indented for the code to run when copied.
        gr.Code(
            """
model = GLiNER.from_pretrained(
"gravitee-io/gliner-pii-detection",
load_onnx_model=True,
load_tokenizer=True, onnx_model_file="model.onnx"
)
text = '''
Hey, just a quick update. I talked to David yesterday.
He sent over the files from his private email ([email protected]), and we should be careful with his SSN: 123-45-6789.
Also, please don't push the GitHub repo until we remove the API key: ghp_abcdEfgh1234567890.
He mentioned his new address is 123 Maple Street in New York.
His PC adress is 192.168.1.100.
'''
labels = ["name",
"email",
"ssn",
"api_key",
"street_address",
"date",
"ipv4"]
entities = model.predict_entities(text, labels)
for entity in entities:
    print(entity["text"], "=>", entity["label"], "=>", entity["score"])
"""
        )
    # The text and label boxes start out filled with the first example.
    input_text = gr.Textbox(
        value=examples[0][0], label="Text input", placeholder="Enter your text here"
    )
    with gr.Row():
        labels = gr.Textbox(
            value=examples[0][1],
            label="Labels",
            placeholder="Enter your labels here (comma separated)",
            scale=2,
        )
        threshold = gr.Slider(
            0,
            1,
            value=0.3,
            step=0.01,
            label="Threshold",
            info="Lower the threshold to increase how many entities get predicted.",
            scale=1,
        )
        nested_ner = gr.Checkbox(
            # Index 3 is the nested-NER boolean of an example row; index 2 is
            # the float threshold and is not a valid checkbox state.
            value=examples[0][3],
            label="Nested NER",
            info="Allow for nested NER?",
            scale=0,
        )
    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")

    # Every trigger below runs `ner` with the same four inputs, in the order
    # of its parameters, and writes to the same output component.
    ner_inputs = [input_text, labels, threshold, nested_ner]

    # The result is not assigned, so the module-level `examples` list keeps
    # referring to the raw example rows.
    gr.Examples(
        examples,
        fn=ner,
        inputs=list(ner_inputs),
        outputs=output,
        cache_examples=True,
    )

    # Submitting: re-run the prediction whenever any control is committed.
    for register_event in (
        input_text.submit,
        labels.submit,
        threshold.release,
        submit_btn.click,
        nested_ner.change,
    ):
        register_event(fn=ner, inputs=list(ner_inputs), outputs=output)
# Turn on request queuing, then start the server with debug output enabled.
demo.queue().launch(debug=True)