import os
from typing import Dict, List, Union

import gradio as gr
from gliner import GLiNER

# One highlighted span in the shape handed to gr.HighlightedText by `ner`.
_Entity = Dict[str, Union[str, int, float]]

# Load the PII-detection checkpoint through its ONNX export. The Hugging Face
# token is read from the HUGGINGFACE_TOKEN environment variable (None if unset).
model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    token=os.getenv("HUGGINGFACE_TOKEN"),
    load_onnx_model=True,
    load_tokenizer=True,
    onnx_model_file="model.onnx",
)

# Each row is [text, comma-separated labels, threshold, nested_ner], i.e. the
# same order as the `inputs` list wired to `ner` below.
examples = [
    [
        "Jana Kowalczyk's driver license number is PL-DL-55443322 and she resides at 78 Ulica Nowowiejska, Wrocław. Her contact email is jana.k@example.com.",
        "name, driver_license_number, street_address, email",
        0.5,
        False,
    ],
    [
        "Nguyen Van Long from Hanoi logs in from the IP 10.0.0.5 and uses the API key: 12ab34cd56ef78gh90ij. His company is VietNet Global.",
        "name, street_address, ipv4, api_key, company",
        0.5,
        False,
    ],
    [
        "Sarah Johnson made a transaction using the credit card 379354508162306 and the CVV 834. Her IBAN is GB29 NWBK 6016 1331 9268 19.",
        "name, credit_card_number, credit_card_security_code, iban",
        0.5,
        False,
    ],
    [
        "Employee Thomas Becker has the ID DE-EMP-44991 and joined DataFlux GmbH on 2021-12-01. His internal email is t.becker@dataflux.de.",
        "name, employee_id, company, date, email",
        0.5,
        False,
    ],
    [
        "Laura Rossi lives at Via Roma 101, Milano. Her social security number is IT-9988776655 and she was born on 1982-07-14.",
        "name, street_address, ssn, date_of_birth",
        0.5,
        False,
    ],
    [
        "Omar El-Zein uses the SWIFT code BOFAUS3N and his bank routing number is 026009593. He lives near 12 Al-Azhar Street, Cairo.",
        "name, swift_bic_code, bank_routing_number, street_address",
        0.5,
        False,
    ],
    [
        "Chen Wei's employee badge shows ID EMP-CN-8899. He signed the contract on 2023-03-20 at 10:15 AM using the password Dragon@123.",
        "name, employee_id, date, time, password",
        0.5,
        False,
    ],
    [
        "Fatoumata Diarra, born 1994-04-04, lives at 45 Avenue de la Liberté, Bamako. Her BBAN is ML2930012345678901234567890.",
        "name, date_of_birth, street_address, bban",
        0.5,
        False,
    ],
    [
        "Daniel Evans has a passport number K01234567 and a permanent address at 500 Pine Street, Seattle. His contact number is +1-206-555-0199.",
        "name, passport_number, street_address, phone_number",
        0.5,
        False,
    ],
    [
        "Alejandro Torres created his customer account on 2024-08-30 using email ale.torres@correo.mx and ID CUST-MX-1122.",
        "name, date, email, customer_id",
        0.5,
        False,
    ],
]


def ner(
    text: str, labels: str, threshold: float, nested_ner: bool
) -> Dict[str, Union[str, List[_Entity]]]:
    """Run the GLiNER model on *text* and format the spans for highlighting.

    Args:
        text: The text to scan.
        labels: Comma-separated entity labels. Whitespace around each label is
            stripped and empty entries are dropped, so ``"name, email"`` and
            ``"name,email"`` request the same labels.
        threshold: Forwarded to ``model.predict_entities`` as ``threshold``.
        nested_ner: When true, ``flat_ner=False`` is passed to the model.

    Returns:
        A dict with the input ``"text"`` and an ``"entities"`` list. Each entry
        carries the predicted label (``"entity"``), the matched text
        (``"word"``) and its ``"start"``/``"end"`` offsets as reported by the
        model. When there is no usable label or the text is blank, the model
        is not called and ``"entities"`` is empty.
    """
    # Splitting on "," alone leaves a leading space on every label after the
    # first when the user types "a, b", and yields [""] for an empty box.
    stripped = (label.strip() for label in labels.split(","))
    label_list = [label for label in stripped if label]

    if not label_list or not (text and text.strip()):
        return {"text": text, "entities": []}

    predictions = model.predict_entities(
        text, label_list, flat_ner=not nested_ner, threshold=threshold
    )
    return {
        "text": text,
        "entities": [
            {
                "entity": prediction["label"],
                "word": prediction["text"],
                "start": prediction["start"],
                "end": prediction["end"],
                # Fixed placeholder: the model's own confidence is not forwarded.
                "score": 0,
            }
            for prediction in predictions
        ],
    }


with gr.Blocks(title="GLiNER-M-v2.1") as demo:
    gr.Markdown(
        """
        # Gravitee PII (Personally Identifiable Information extraction)

        GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
        """
    )
    with gr.Accordion("How to run this model locally", open=False):
        gr.Markdown(
            """
            ## Installation
            To use this model, you must install the GLiNER Python library:
            ```
            !pip install gliner
            ```

            ## Usage
            Once you've downloaded the GLiNER library, you can import the GLiNER class. You can then load this model using `GLiNER.from_pretrained` and predict entities with `predict_entities`.
            """
        )
        # The snippet is kept flush-left so the displayed code has no stray
        # leading indentation.
        gr.Code(
            """
model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    load_onnx_model=True,
    load_tokenizer=True,
    onnx_model_file="model.onnx"
)

text = '''
Hey, just a quick update. I talked to David yesterday. He sent over the files from his private email (david.doe@example.com), and we should be careful with his SSN: 123-45-6789. Also, please don't push the GitHub repo until we remove the API key: ghp_abcdEfgh1234567890.
He mentioned his new address is 123 Maple Street in New York. His PC adress is 192.168.1.100.
'''

labels = ["name", "email", "ssn", "api_key", "street_address", "date", "ipv4"]

entities = model.predict_entities(text, labels)

for entity in entities:
    print(entity["text"], "=>", entity["label"], "=>", entity["score"])
"""
        )

    input_text = gr.Textbox(
        value=examples[0][0], label="Text input", placeholder="Enter your text here"
    )
    with gr.Row() as row:
        labels = gr.Textbox(
            value=examples[0][1],
            label="Labels",
            placeholder="Enter your labels here (comma separated)",
            scale=2,
        )
        threshold = gr.Slider(
            0,
            1,
            value=0.3,
            step=0.01,
            label="Threshold",
            info="Lower the threshold to increase how many entities get predicted.",
            scale=1,
        )
        nested_ner = gr.Checkbox(
            # Column 3 is the nested_ner flag; column 2 is the threshold, whose
            # non-zero value would leave the box checked by default.
            value=examples[0][3],
            label="Nested NER",
            info="Allow for nested NER?",
            scale=0,
        )
    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")

    # Every trigger below runs the same function on the same components.
    ner_event = {
        "fn": ner,
        "inputs": [input_text, labels, threshold, nested_ner],
        "outputs": output,
    }

    # NOTE: this rebinds `examples` from the raw rows to the Examples
    # component, so the list must not be indexed after this point.
    examples = gr.Examples(examples, cache_examples=True, **ner_event)

    # Submitting
    input_text.submit(**ner_event)
    labels.submit(**ner_event)
    threshold.release(**ner_event)
    submit_btn.click(**ner_event)
    nested_ner.change(**ner_event)

demo.queue()
demo.launch(debug=True)