gliner-pii

Running

File size: 6,849 Bytes

e8574b8
24e3585
e8574b8
ee1da13
e8574b8
ee1da13
 
 
 
 
 
e8574b8
 
 
ee1da13
 
c59464c
 
e8574b8
5cda7f5
ee1da13
 
c59464c
5cda7f5
 
e8574b8
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
 
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
 
5cda7f5
 
 
e8574b8
 
 
 
 
 
 
 
 
 
 
5cda7f5
 
 
e8574b8
 
 
 
63a56b9
e8574b8
 
ee1da13
e8574b8
 
 
ee1da13
e8574b8
 
 
 
 
24e3585
e8574b8
24e3585
e8574b8
 
 
24e3585
e8574b8
 
 
ee1da13
 
 
 
 
 
e8574b8
ee1da13
 
 
 
 
 
 
e8574b8
ee1da13
 
 
 
 
 
 
e8574b8
4b92f32
e8574b8
4b92f32
ee1da13
e8574b8
 
 
 
 
 
18a08c1
 
 
 
 
5cda7f5
 
 
 
 
63a56b9
5cda7f5
 
 
18a08c1
 
 
 
 
 
 
 
e8574b8
 
 
 
 
5cda7f5
e8574b8
 
 
 
 
5cda7f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8574b8
 
18a08c1

from typing import Dict, Union
from gliner import GLiNER
import gradio as gr
import os

# Load the "gravitee-io/gliner-pii-detection" checkpoint through its ONNX
# export ("model.onnx") together with its tokenizer. The `token` argument is
# taken from the HUGGINGFACE_TOKEN environment variable and is None when that
# variable is not set.
model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    load_tokenizer=True,
    load_onnx_model=True,
    onnx_model_file="model.onnx",
    token=os.environ.get("HUGGINGFACE_TOKEN"),
)

# Demo rows in the order [text, comma-separated labels, threshold, nested flag].
# Every row shares the same threshold (0.5) and nested flag (False), so only
# the text and the label string are spelled out per sample.
examples = [
    [sample_text, sample_labels, 0.5, False]
    for sample_text, sample_labels in (
        (
            "Jana Kowalczyk's driver license number is PL-DL-55443322 and she resides at 78 Ulica Nowowiejska, Wrocław. Her contact email is [email protected].",
            "name, driver_license_number, street_address, email",
        ),
        (
            "Nguyen Van Long from Hanoi logs in from the IP 10.0.0.5 and uses the API key: 12ab34cd56ef78gh90ij. His company is VietNet Global.",
            "name, street_address, ipv4, api_key, company",
        ),
        (
            "Sarah Johnson made a transaction using the credit card 379354508162306 and the CVV 834. Her IBAN is GB29 NWBK 6016 1331 9268 19.",
            "name, credit_card_number, credit_card_security_code, iban",
        ),
        (
            "Employee Thomas Becker has the ID DE-EMP-44991 and joined DataFlux GmbH on 2021-12-01. His internal email is [email protected].",
            "name, employee_id, company, date, email",
        ),
        (
            "Laura Rossi lives at Via Roma 101, Milano. Her social security number is IT-9988776655 and she was born on 1982-07-14.",
            "name, street_address, ssn, date_of_birth",
        ),
        (
            "Omar El-Zein uses the SWIFT code BOFAUS3N and his bank routing number is 026009593. He lives near 12 Al-Azhar Street, Cairo.",
            "name, swift_bic_code, bank_routing_number, street_address",
        ),
        (
            "Chen Wei's employee badge shows ID EMP-CN-8899. He signed the contract on 2023-03-20 at 10:15 AM using the password Dragon@123.",
            "name, employee_id, date, time, password",
        ),
        (
            "Fatoumata Diarra, born 1994-04-04, lives at 45 Avenue de la Liberté, Bamako. Her BBAN is ML2930012345678901234567890.",
            "name, date_of_birth, street_address, bban",
        ),
        (
            "Daniel Evans has a passport number K01234567 and a permanent address at 500 Pine Street, Seattle. His contact number is +1-206-555-0199.",
            "name, passport_number, street_address, phone_number",
        ),
        (
            "Alejandro Torres created his customer account on 2024-08-30 using email [email protected] and ID CUST-MX-1122.",
            "name, date, email, customer_id",
        ),
    )
]

def ner(
    text: str, labels: str, threshold: float, nested_ner: bool
) -> Dict[str, object]:
    """Predict entities in ``text`` and shape them for the highlighted output.

    Args:
        text: The text to scan.
        labels: Comma-separated entity labels, e.g. ``"name, email"``.
            Whitespace around each label is stripped and empty items are
            dropped before the labels are handed to the model.
        threshold: Forwarded to ``model.predict_entities`` as ``threshold``.
        nested_ner: When true, the model is called with ``flat_ner=False``;
            otherwise with ``flat_ner=True``.

    Returns:
        A dict with the original ``"text"`` and an ``"entities"`` list. Each
        entity is a dict with the keys ``"entity"`` (label), ``"word"``
        (matched text), ``"start"``, ``"end"`` and ``"score"``.
    """
    # The UI and the bundled examples separate labels with ", ", so a plain
    # split(",") would hand the model labels with a leading space.
    label_list = [label.strip() for label in labels.split(",") if label.strip()]

    # Nothing to predict without text or without at least one label; return an
    # empty result instead of calling the model with degenerate input.
    if not text or not text.strip() or not label_list:
        return {"text": text, "entities": []}

    predictions = model.predict_entities(
        text, label_list, flat_ner=not nested_ner, threshold=threshold
    )
    return {
        "text": text,
        "entities": [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                # Report the prediction's own score rather than a constant 0.
                "score": float(entity["score"]),
            }
            for entity in predictions
        ],
    }


# Build the demo UI: an introduction, a collapsible "run locally" guide, the
# input controls, the highlighted output, and the events that call `ner`.
with gr.Blocks(title="GLiNER-M-v2.1") as demo:
    gr.Markdown(
        """
        # Gravitee PII (Personnally Identifiable Information extraction)

        GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
        """

    )
    with gr.Accordion("How to run this model locally", open=False):
        gr.Markdown(
            """
            ## Installation
            To use this model, you must install the GLiNER Python library:
            ```
            !pip install gliner
            ```
         
            ## Usage
            Once you've downloaded the GLiNER library, you can import the GLiNER class. You can then load this model using `GLiNER.from_pretrained` and predict entities with `predict_entities`.
            """
        )
        gr.Code(
            """
model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    load_onnx_model=True,
    load_tokenizer=True, onnx_model_file="model.onnx"
)

text = '''
Hey, just a quick update. I talked to David yesterday. 
He sent over the files from his private email ([email protected]), and we should be careful with his SSN: 123-45-6789.
Also, please don't push the GitHub repo until we remove the API key: ghp_abcdEfgh1234567890.
He mentioned his new address is 123 Maple Street in New York. 
His PC adress is 192.168.1.100.
'''

labels = ["name",
          "email",
          "ssn",
          "api_key",
          "street_address",
          "date",
          "ipv4"]

entities = model.predict_entities(text, labels)

for entity in entities:
    print(entity["text"], "=>", entity["label"], "=>", entity["score"])
            """
        )

    # Input controls, pre-filled from the first example row
    # ([text, labels, threshold, nested flag]).
    input_text = gr.Textbox(
        value=examples[0][0], label="Text input", placeholder="Enter your text here"
    )
    with gr.Row() as row:
        labels = gr.Textbox(
            value=examples[0][1],
            label="Labels",
            placeholder="Enter your labels here (comma separated)",
            scale=2,
        )
        threshold = gr.Slider(
            0,
            1,
            value=0.3,
            step=0.01,
            label="Threshold",
            info="Lower the threshold to increase how many entities get predicted.",
            scale=1,
        )
        nested_ner = gr.Checkbox(
            # Index 3 is the nested-NER flag; index 2 is the threshold (0.5),
            # which is not a boolean and must not seed the checkbox.
            value=examples[0][3],
            label="Nested NER",
            info="Allow for nested NER?",
            scale=0,
        )
    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")
    # NOTE: this rebinds the module-level name `examples` from the raw list of
    # rows to the gr.Examples component; the list itself is consumed as the
    # first argument before the rebinding happens.
    examples = gr.Examples(
        examples,
        fn=ner,
        inputs=[input_text, labels, threshold, nested_ner],
        outputs=output,
        cache_examples=True,
    )

    # Submitting: every trigger below runs `ner` with the same four inputs and
    # writes the result to the highlighted-text output.
    input_text.submit(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )
    labels.submit(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )
    threshold.release(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )
    submit_btn.click(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )
    nested_ner.change(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )

# Turn on the request queue, then start the app with debug output enabled.
demo.queue().launch(debug=True)