Spaces:
Running
Running
import gradio as gr | |
import pandas as pd | |
from pathlib import Path | |
from huggingface_hub import HfApi, Repository | |
# Tags an annotator may assign to a token; "O" marks a token outside any entity.
LABELS = {"PER", "ORG", "LOC", "O"}

# Module-wide store for the token table shared by all callbacks.  It starts
# empty but already carries the three-column schema, so code that reads
# ``token_df["label"]`` before any CSV is loaded does not hit a KeyError.
token_df = pd.DataFrame(columns=["example_id", "token", "label"])
# ───────────────────────── token explode ───────────────────────
def explode(df: pd.DataFrame) -> pd.DataFrame:
    """Split every example into whitespace tokens, one output row per token.

    Args:
        df: Input table with either a ``text`` column, or both ``user`` and
            ``assistant`` columns (rendered as
            ``"User: <user> Assistant: <assistant>"`` before splitting).

    Returns:
        DataFrame with columns ``example_id`` (0-based position of the source
        row), ``token`` and ``label`` (always ``"O"``).  The three columns are
        present even when the input produces no tokens.
    """
    if "text" in df.columns:
        lines = df["text"].astype(str)
    else:
        # Build the dialog lines by zipping the two columns instead of
        # ``df.apply(..., axis=1)``: on an empty frame ``apply`` returns a
        # DataFrame, and iterating that would yield the column names as if
        # they were lines of text.
        lines = [
            f"User: {user} Assistant: {assistant}"
            for user, assistant in zip(df["user"], df["assistant"])
        ]

    rows = [
        {"example_id": sid, "token": tok, "label": "O"}
        for sid, line in enumerate(lines)
        for tok in line.split()
    ]
    # Explicit columns keep the schema stable when ``rows`` is empty.
    return pd.DataFrame(rows, columns=["example_id", "token", "label"])
# ───────────────────────── callbacks ───────────────────────────
def load_csv(file):
    """Read the uploaded CSV, tokenize it and populate the token table.

    Args:
        file: Value of the upload component.  May be ``None`` (nothing
            uploaded), an object exposing the temp-file path as ``.name``,
            or a plain path string.

    Returns:
        A 5-tuple of updates for (token table, status text, button row,
        tokens download link, IOB download link).  On any failure the table
        value is ``None``, the status carries the reason, and the remaining
        three components are hidden.
    """
    global token_df
    hidden = gr.update(visible=False)

    if file is None:
        # Clicking "Load" without an upload used to raise AttributeError.
        return None, "Please upload a CSV file first.", hidden, hidden, hidden

    # Accept both file-like wrappers (``.name``) and bare path strings.
    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError,
            UnicodeDecodeError, OSError) as err:
        return None, f"Could not read CSV: {err}", hidden, hidden, hidden

    has_text = "text" in df.columns
    has_dialog = {"user", "assistant"}.issubset(df.columns)
    if not (has_text or has_dialog):
        msg = "β CSV needs a `text` column **or** both `user` and `assistant` columns."
        return None, msg, hidden, hidden, hidden

    token_df = explode(df)
    return (
        gr.update(value=token_df, visible=True, row_count=len(token_df)),
        f"β {len(df)} rows β {len(token_df)} tokens.",
        gr.update(visible=True),   # show the action buttons
        gr.update(visible=False),  # reset download links from a previous load
        gr.update(visible=False),
    )
def save_table(tbl):
    """Store the edited table in the global ``token_df`` and validate its labels.

    Args:
        tbl: Table contents from the editable grid; anything
            ``pd.DataFrame`` accepts as data.

    Returns:
        A status string: a confirmation when every label is in ``LABELS``,
        otherwise a warning listing the distinct unknown labels.  The table
        is stored in both cases.
    """
    global token_df
    token_df = pd.DataFrame(tbl, columns=["example_id", "token", "label"])
    unknown = token_df.loc[~token_df["label"].isin(LABELS), "label"].unique()
    if unknown.size == 0:
        return "πΎ Saved."
    # Unknown labels may be non-strings (NaN from a cleared cell, a number);
    # convert each before joining so the warning itself cannot raise TypeError.
    return f"β οΈ Unknown label(s): {', '.join(map(str, unknown))}"
def export_tokens():
    """Dump the current token table to ``raw_tokens.csv`` and expose it for download.

    Returns:
        A component update that sets the file value to the written CSV and
        makes the download link visible.
    """
    out_path = "raw_tokens.csv"
    token_df.to_csv(out_path, index=False)
    link_update = gr.update(value=out_path, visible=True)
    return link_update
def export_iob():
    """Write the token table with an added ``iob`` column to ``ner_iob.csv``.

    A token labelled ``"O"`` gets the tag ``"O"``.  Any other label gets
    ``"I-<label>"`` when the previous token of the same ``example_id``
    carried the same label, and ``"B-<label>"`` otherwise.  Missing or blank
    labels are treated as ``"O"`` so the export neither crashes nor emits an
    empty ``"B-"`` tag.

    Returns:
        A component update that sets the file value to the written CSV and
        makes the download link visible.
    """
    tags = []
    last_label = {}  # example_id -> label of the previous token (None after "O")
    # itertuples keeps column dtypes and avoids building a Series per row.
    for row in token_df.itertuples(index=False):
        sid, label = row.example_id, row.label
        if pd.isna(label) or not str(label).strip():
            label = "O"
        label = str(label)
        if label == "O":
            tags.append("O")
            last_label[sid] = None
            continue
        prefix = "I-" if last_label.get(sid) == label else "B-"
        tags.append(prefix + label)
        last_label[sid] = label

    out = token_df.copy()
    out["iob"] = tags
    fname = "ner_iob.csv"
    out.to_csv(fname, index=False)
    return gr.update(value=fname, visible=True)
def push_to_hub(repo_id, token):
    """Upload the current token table as ``data.csv`` to a Hub dataset repo.

    The repo is created if it does not exist.  The CSV is sent as a single
    commit through the HTTP API, so no local git clone is needed.  The
    earlier clone-based flow tried to ``unlink()`` the ``.git`` directory of
    the previous clone, which made every push after the first one fail.

    Args:
        repo_id: Target dataset repo, e.g. ``"user/name"``.
        token: Hugging Face access token with write permission.  An empty
            value is passed on as ``None``.

    Returns:
        A status string: the dataset URL on success, otherwise the error
        message.  Exceptions are reported in the UI, not raised.
    """
    try:
        repo_id = (repo_id or "").strip()
        token = (token or "").strip() or None
        api = HfApi()
        api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
        api.upload_file(
            path_or_fileobj=token_df.to_csv(index=False).encode("utf-8"),
            path_in_repo="data.csv",
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message="Add annotated NER data",
        )
        return f"π https://huggingface.co/datasets/{repo_id}"
    except Exception as e:  # UI boundary: surface any failure as status text
        return f"β {e}"
# ───────────────────────── UI ──────────────────────────────────
# NOTE(review): the nesting below is reconstructed from a flattened copy of
# the file; confirm which components belong inside each Row / Accordion.
with gr.Blocks() as demo:
    gr.Markdown("# π·οΈ Label It! Mini-NER")
    gr.Markdown("**Step 1** β upload CSV (columns: `text` **or** `user`+`assistant`).")

    # Upload widget and its trigger button.
    with gr.Row():
        csv_file = gr.File(file_types=[".csv"])
        load_btn = gr.Button("Load")

    status = gr.Textbox(interactive=False)

    # Token grid; created hidden and revealed by the load callback.
    tok_table = gr.Dataframe(
        headers=["example_id", "token", "label"],
        datatype=["number", "str", "str"],
        visible=False,
    )

    # Save / export actions; created hidden.
    with gr.Row(visible=False) as buttons:
        save_btn = gr.Button("πΎ Save")
        tok_btn = gr.Button("β¬οΈ Tokens CSV")
        iob_btn = gr.Button("β¬οΈ IOB CSV")

    # Download targets filled in by the export callbacks.
    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)

    with gr.Accordion("π¦ Push to Hugging Face Hub", open=False, visible=False) as acc:
        repo_in = gr.Textbox(label="repo")
        token_in = gr.Textbox(label="token", type="password")
        push_btn = gr.Button("Push")
        push_out = gr.Textbox(interactive=False)

    # Event wiring.
    load_btn.click(
        fn=load_csv,
        inputs=csv_file,
        outputs=[tok_table, status, buttons, file_tok, file_iob],
    )
    # Second handler on the same button: reveal the Hub accordion.
    load_btn.click(fn=lambda: gr.update(visible=True), inputs=None, outputs=acc)
    save_btn.click(fn=save_table, inputs=tok_table, outputs=status)
    tok_btn.click(fn=export_tokens, outputs=file_tok)
    iob_btn.click(fn=export_iob, outputs=file_iob)
    push_btn.click(fn=push_to_hub, inputs=[repo_in, token_in], outputs=push_out)

    gr.Markdown("**Step 2** β label tokens (`PER`, `ORG`, `LOC`, `O`) β Save β Download / Push.")

demo.launch()