Spaces:
Running
Running
File size: 6,849 Bytes
e8574b8 24e3585 e8574b8 ee1da13 e8574b8 ee1da13 e8574b8 ee1da13 c59464c e8574b8 5cda7f5 ee1da13 c59464c 5cda7f5 e8574b8 ee1da13 c59464c 18a08c1 e8574b8 ee1da13 c59464c 18a08c1 e8574b8 ee1da13 c59464c 18a08c1 e8574b8 ee1da13 c59464c e8574b8 ee1da13 c59464c 18a08c1 e8574b8 ee1da13 c59464c 18a08c1 e8574b8 ee1da13 c59464c 18a08c1 e8574b8 ee1da13 c59464c 18a08c1 e8574b8 5cda7f5 e8574b8 5cda7f5 e8574b8 63a56b9 e8574b8 ee1da13 e8574b8 ee1da13 e8574b8 24e3585 e8574b8 24e3585 e8574b8 24e3585 e8574b8 ee1da13 e8574b8 ee1da13 e8574b8 ee1da13 e8574b8 4b92f32 e8574b8 4b92f32 ee1da13 e8574b8 18a08c1 5cda7f5 63a56b9 5cda7f5 18a08c1 e8574b8 5cda7f5 e8574b8 5cda7f5 e8574b8 18a08c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
from typing import Dict, Union
from gliner import GLiNER
import gradio as gr
import os
# Load the GLiNER PII-detection checkpoint once, at import time, so every
# request handled by this app reuses the same model object.
model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    # The access token comes from the environment; it is None when
    # HUGGINGFACE_TOKEN is not set.
    token=os.environ.get("HUGGINGFACE_TOKEN"),
    # Ask for the ONNX weights stored in the repository as "model.onnx".
    load_onnx_model=True,
    load_tokenizer=True,
    onnx_model_file="model.onnx",
)
# Threshold and nested-NER flag shared by every bundled example row.
_EXAMPLE_THRESHOLD = 0.5
_EXAMPLE_NESTED_NER = False

# (text, comma-separated labels) pairs shown in the demo's example gallery.
# NOTE(review): the e-mail addresses use the reserved example.com domain; the
# previous text only contained the redaction placeholder "[email protected]",
# which is not an e-mail address, so the "email" label had nothing to match.
_EXAMPLE_INPUTS = [
    (
        "Jana Kowalczyk's driver license number is PL-DL-55443322 and she resides at 78 Ulica Nowowiejska, Wrocław. Her contact email is jana.kowalczyk@example.com.",
        "name, driver_license_number, street_address, email",
    ),
    (
        "Nguyen Van Long from Hanoi logs in from the IP 10.0.0.5 and uses the API key: 12ab34cd56ef78gh90ij. His company is VietNet Global.",
        "name, street_address, ipv4, api_key, company",
    ),
    (
        "Sarah Johnson made a transaction using the credit card 379354508162306 and the CVV 834. Her IBAN is GB29 NWBK 6016 1331 9268 19.",
        "name, credit_card_number, credit_card_security_code, iban",
    ),
    (
        "Employee Thomas Becker has the ID DE-EMP-44991 and joined DataFlux GmbH on 2021-12-01. His internal email is thomas.becker@example.com.",
        "name, employee_id, company, date, email",
    ),
    (
        "Laura Rossi lives at Via Roma 101, Milano. Her social security number is IT-9988776655 and she was born on 1982-07-14.",
        "name, street_address, ssn, date_of_birth",
    ),
    (
        "Omar El-Zein uses the SWIFT code BOFAUS3N and his bank routing number is 026009593. He lives near 12 Al-Azhar Street, Cairo.",
        "name, swift_bic_code, bank_routing_number, street_address",
    ),
    (
        "Chen Wei's employee badge shows ID EMP-CN-8899. He signed the contract on 2023-03-20 at 10:15 AM using the password Dragon@123.",
        "name, employee_id, date, time, password",
    ),
    (
        "Fatoumata Diarra, born 1994-04-04, lives at 45 Avenue de la Liberté, Bamako. Her BBAN is ML2930012345678901234567890.",
        "name, date_of_birth, street_address, bban",
    ),
    (
        "Daniel Evans has a passport number K01234567 and a permanent address at 500 Pine Street, Seattle. His contact number is +1-206-555-0199.",
        "name, passport_number, street_address, phone_number",
    ),
    (
        "Alejandro Torres created his customer account on 2024-08-30 using email alejandro.torres@example.com and ID CUST-MX-1122.",
        "name, date, email, customer_id",
    ),
]

# One row per example, in the column order the UI feeds to `ner`:
# [text, labels, threshold, nested_ner].
examples = [
    [text, label_csv, _EXAMPLE_THRESHOLD, _EXAMPLE_NESTED_NER]
    for text, label_csv in _EXAMPLE_INPUTS
]
def ner(
    text, labels: str, threshold: float, nested_ner: bool
) -> Dict[str, Union[str, list]]:
    """Run the module-level GLiNER model on *text* and format the result.

    Args:
        text: The text to scan for entities.
        labels: Comma-separated entity labels, e.g. ``"name, email"``.
            Whitespace around each label is ignored and empty entries are
            dropped.
        threshold: Score threshold forwarded to ``model.predict_entities``.
        nested_ner: When true, the model is called with ``flat_ner=False``;
            otherwise with ``flat_ner=True``.

    Returns:
        A dict with the original ``"text"`` and an ``"entities"`` list. Each
        entity is a dict with the keys ``"entity"`` (label), ``"word"``
        (matched text), ``"start"``, ``"end"`` and ``"score"``.
    """
    # The UI delivers labels as one string, usually written "a, b, c"; strip
    # each piece so the model is not given labels with a leading space.
    label_names = [piece.strip() for piece in labels.split(",")]
    label_names = [name for name in label_names if name]

    # Nothing to extract without text or without at least one real label, so
    # skip the model call entirely.
    if not text or not label_names:
        return {"text": text, "entities": []}

    predictions = model.predict_entities(
        text, label_names, flat_ner=not nested_ner, threshold=threshold
    )
    return {
        "text": text,
        "entities": [
            {
                "entity": prediction["label"],
                "word": prediction["text"],
                "start": prediction["start"],
                "end": prediction["end"],
                # Report the model's own score instead of a constant 0.
                "score": prediction["score"],
            }
            for prediction in predictions
        ],
    }
# Build the demo page: intro text, local-usage instructions, the input
# controls, the highlighted output, and the events that trigger `ner`.
# NOTE(review): the page title still reads "GLiNER-M-v2.1" while the loaded
# checkpoint is the Gravitee PII model — confirm whether that is intended.
with gr.Blocks(title="GLiNER-M-v2.1") as demo:
    gr.Markdown(
        """
# Gravitee PII (Personally Identifiable Information extraction)
GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
"""
    )
    with gr.Accordion("How to run this model locally", open=False):
        gr.Markdown(
            """
## Installation
To use this model, you must install the GLiNER Python library:
```
!pip install gliner
```
## Usage
Once you've downloaded the GLiNER library, you can import the GLiNER class. You can then load this model using `GLiNER.from_pretrained` and predict entities with `predict_entities`.
"""
        )
        # The snippet is shown verbatim to users, so it must be runnable as
        # written: it carries its own import and a properly indented loop.
        gr.Code(
            """
from gliner import GLiNER

model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    load_onnx_model=True,
    load_tokenizer=True,
    onnx_model_file="model.onnx",
)

text = '''
Hey, just a quick update. I talked to David yesterday.
He sent over the files from his private email (david@example.com), and we should be careful with his SSN: 123-45-6789.
Also, please don't push the GitHub repo until we remove the API key: ghp_abcdEfgh1234567890.
He mentioned his new address is 123 Maple Street in New York.
His PC address is 192.168.1.100.
'''

labels = [
    "name",
    "email",
    "ssn",
    "api_key",
    "street_address",
    "date",
    "ipv4",
]

entities = model.predict_entities(text, labels)

for entity in entities:
    print(entity["text"], "=>", entity["label"], "=>", entity["score"])
""",
            language="python",
        )

    # The controls start out pre-filled from the first example row, whose
    # columns are [text, labels, threshold, nested_ner].
    input_text = gr.Textbox(
        value=examples[0][0], label="Text input", placeholder="Enter your text here"
    )
    with gr.Row():
        labels = gr.Textbox(
            value=examples[0][1],
            label="Labels",
            placeholder="Enter your labels here (comma separated)",
            scale=2,
        )
        threshold = gr.Slider(
            0,
            1,
            value=0.3,
            step=0.01,
            label="Threshold",
            info="Lower the threshold to increase how many entities get predicted.",
            scale=1,
        )
        nested_ner = gr.Checkbox(
            # Column 3 holds the nested-NER flag; column 2 is the threshold.
            value=examples[0][3],
            label="Nested NER",
            info="Allow for nested NER?",
            scale=0,
        )
    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")

    # Every trigger below feeds the same four components to `ner`.
    ner_inputs = [input_text, labels, threshold, nested_ner]

    # The component is not assigned back to `examples`, so that name keeps
    # referring to the list of example rows.
    gr.Examples(
        examples,
        fn=ner,
        inputs=ner_inputs,
        outputs=output,
        cache_examples=True,
    )

    # Submitting: re-run the extraction whenever the user submits either
    # textbox, releases the slider, clicks the button, or toggles the checkbox.
    for register in (
        input_text.submit,
        labels.submit,
        threshold.release,
        submit_btn.click,
        nested_ner.change,
    ):
        register(fn=ner, inputs=ner_inputs, outputs=output)

demo.queue()
demo.launch(debug=True)
|