File size: 6,849 Bytes
e8574b8
24e3585
e8574b8
ee1da13
e8574b8
ee1da13
 
 
 
 
 
e8574b8
 
 
ee1da13
 
c59464c
 
e8574b8
5cda7f5
ee1da13
 
c59464c
5cda7f5
 
e8574b8
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
 
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
ee1da13
 
c59464c
18a08c1
e8574b8
 
 
5cda7f5
 
 
e8574b8
 
 
 
 
 
 
 
 
 
 
5cda7f5
 
 
e8574b8
 
 
 
63a56b9
e8574b8
 
ee1da13
e8574b8
 
 
ee1da13
e8574b8
 
 
 
 
24e3585
e8574b8
24e3585
e8574b8
 
 
24e3585
e8574b8
 
 
ee1da13
 
 
 
 
 
e8574b8
ee1da13
 
 
 
 
 
 
e8574b8
ee1da13
 
 
 
 
 
 
e8574b8
4b92f32
e8574b8
4b92f32
ee1da13
e8574b8
 
 
 
 
 
18a08c1
 
 
 
 
5cda7f5
 
 
 
 
63a56b9
5cda7f5
 
 
18a08c1
 
 
 
 
 
 
 
e8574b8
 
 
 
 
5cda7f5
e8574b8
 
 
 
 
5cda7f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8574b8
 
18a08c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
from typing import Dict, Union
from gliner import GLiNER
import gradio as gr
import os

# Load the PII-detection checkpoint once at import time. The Hugging Face token
# is read from the HUGGINGFACE_TOKEN environment variable (None when unset).
model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    token=os.environ.get("HUGGINGFACE_TOKEN"),
    load_onnx_model=True,
    load_tokenizer=True,
    onnx_model_file="model.onnx",
)

# Sample rows for the demo. Each row is laid out as
# [text, comma-separated labels, threshold, nested-NER flag]; every sample uses
# a threshold of 0.5 with nested NER switched off, so only the text and the
# labels are spelled out per sample.
examples = [
    [sample_text, sample_labels, 0.5, False]
    for sample_text, sample_labels in (
        (
            "Jana Kowalczyk's driver license number is PL-DL-55443322 and she resides at 78 Ulica Nowowiejska, Wrocław. Her contact email is [email protected].",
            "name, driver_license_number, street_address, email",
        ),
        (
            "Nguyen Van Long from Hanoi logs in from the IP 10.0.0.5 and uses the API key: 12ab34cd56ef78gh90ij. His company is VietNet Global.",
            "name, street_address, ipv4, api_key, company",
        ),
        (
            "Sarah Johnson made a transaction using the credit card 379354508162306 and the CVV 834. Her IBAN is GB29 NWBK 6016 1331 9268 19.",
            "name, credit_card_number, credit_card_security_code, iban",
        ),
        (
            "Employee Thomas Becker has the ID DE-EMP-44991 and joined DataFlux GmbH on 2021-12-01. His internal email is [email protected].",
            "name, employee_id, company, date, email",
        ),
        (
            "Laura Rossi lives at Via Roma 101, Milano. Her social security number is IT-9988776655 and she was born on 1982-07-14.",
            "name, street_address, ssn, date_of_birth",
        ),
        (
            "Omar El-Zein uses the SWIFT code BOFAUS3N and his bank routing number is 026009593. He lives near 12 Al-Azhar Street, Cairo.",
            "name, swift_bic_code, bank_routing_number, street_address",
        ),
        (
            "Chen Wei's employee badge shows ID EMP-CN-8899. He signed the contract on 2023-03-20 at 10:15 AM using the password Dragon@123.",
            "name, employee_id, date, time, password",
        ),
        (
            "Fatoumata Diarra, born 1994-04-04, lives at 45 Avenue de la Liberté, Bamako. Her BBAN is ML2930012345678901234567890.",
            "name, date_of_birth, street_address, bban",
        ),
        (
            "Daniel Evans has a passport number K01234567 and a permanent address at 500 Pine Street, Seattle. His contact number is +1-206-555-0199.",
            "name, passport_number, street_address, phone_number",
        ),
        (
            "Alejandro Torres created his customer account on 2024-08-30 using email [email protected] and ID CUST-MX-1122.",
            "name, date, email, customer_id",
        ),
    )
]

def ner(
    text, labels: str, threshold: float, nested_ner: bool
) -> Dict[str, Union[str, list]]:
    """Predict entities in ``text`` and shape them for the highlighted output.

    Args:
        text: The text to scan.
        labels: Comma-separated entity labels, e.g. ``"name, email"``.
            Whitespace around each label is stripped and empty items
            (from a trailing comma or a blank field) are dropped.
        threshold: Forwarded to ``model.predict_entities`` as ``threshold``.
        nested_ner: When true the model is called with ``flat_ner=False``.

    Returns:
        A dict with the input under ``"text"`` and, under ``"entities"``, one
        dict per prediction with the keys ``entity``, ``word``, ``start``,
        ``end`` and ``score``. When there is no text or no usable label the
        entity list is empty and the model is not called.
    """
    # "name, email" must yield ["name", "email"], not ["name", " email"].
    label_list = [
        label.strip() for label in (labels or "").split(",") if label.strip()
    ]

    # Nothing to look for: skip the model call instead of passing it
    # degenerate input.
    if not text or not label_list:
        return {"text": text or "", "entities": []}

    predictions = model.predict_entities(
        text, label_list, flat_ner=not nested_ner, threshold=threshold
    )
    return {
        "text": text,
        "entities": [
            {
                "entity": prediction["label"],
                "word": prediction["text"],
                "start": prediction["start"],
                "end": prediction["end"],
                # NOTE(review): the score is fixed at 0 rather than taken from
                # the prediction; confirm this is intentional.
                "score": 0,
            }
            for prediction in predictions
        ],
    }


# Build the demo page: introduction, local-usage instructions, the input
# widgets, the example table and the event wiring that calls ``ner``.
with gr.Blocks(title="GLiNER-M-v2.1") as demo:
    gr.Markdown(
        """
        # Gravitee PII (Personnally Identifiable Information extraction)

        GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
        """
    )
    with gr.Accordion("How to run this model locally", open=False):
        gr.Markdown(
            """
            ## Installation
            To use this model, you must install the GLiNER Python library:
            ```
            !pip install gliner
            ```
         
            ## Usage
            Once you've downloaded the GLiNER library, you can import the GLiNER class. You can then load this model using `GLiNER.from_pretrained` and predict entities with `predict_entities`.
            """
        )
        gr.Code(
            """
model = GLiNER.from_pretrained(
    "gravitee-io/gliner-pii-detection",
    load_onnx_model=True,
    load_tokenizer=True, onnx_model_file="model.onnx"
)

text = '''
Hey, just a quick update. I talked to David yesterday. 
He sent over the files from his private email ([email protected]), and we should be careful with his SSN: 123-45-6789.
Also, please don't push the GitHub repo until we remove the API key: ghp_abcdEfgh1234567890.
He mentioned his new address is 123 Maple Street in New York. 
His PC adress is 192.168.1.100.
'''

labels = ["name",
          "email",
          "ssn",
          "api_key",
          "street_address",
          "date",
          "ipv4"]

entities = model.predict_entities(text, labels)

for entity in entities:
    print(entity["text"], "=>", entity["label"], "=>", entity["score"])
            """
        )

    # The widgets start out pre-filled from the first example row, which is
    # laid out as [text, labels, threshold, nested flag].
    input_text = gr.Textbox(
        value=examples[0][0], label="Text input", placeholder="Enter your text here"
    )
    with gr.Row() as row:
        labels = gr.Textbox(
            value=examples[0][1],
            label="Labels",
            placeholder="Enter your labels here (comma separated)",
            scale=2,
        )
        threshold = gr.Slider(
            0,
            1,
            value=0.3,
            step=0.01,
            label="Threshold",
            info="Lower the threshold to increase how many entities get predicted.",
            scale=1,
        )
        nested_ner = gr.Checkbox(
            # Index 3 is the boolean nested flag; index 2 is the numeric
            # threshold and must not be used as the checkbox state.
            value=examples[0][3],
            label="Nested NER",
            info="Allow for nested NER?",
            scale=0,
        )
    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")

    # Every trigger below runs the same function on the same components, so
    # the wiring is defined once.
    ner_event = dict(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )

    # This rebinds ``examples`` from the raw row list to the Examples
    # component, so it has to come after the widgets that read examples[0].
    examples = gr.Examples(
        examples,
        fn=ner,
        inputs=[input_text, labels, threshold, nested_ner],
        outputs=output,
        cache_examples=True,
    )

    # Submitting
    for trigger in (
        input_text.submit,
        labels.submit,
        threshold.release,
        submit_btn.click,
        nested_ner.change,
    ):
        trigger(**ner_event)

# Call queue() on the Blocks app first, then start it; debug=True is passed
# straight through to launch().
demo.queue()
demo.launch(debug=True)