# Page residue from the hosting site's file viewer (not part of the program):
#   Suzana's picture
#   Update app.py
#   44cb2f4 verified
import gradio as gr
import pandas as pd
from pathlib import Path
from huggingface_hub import HfApi, Repository
# Allowed tags: label values that save_table() accepts without a warning.
LABELS = {"PER", "ORG", "LOC", "O"}
# Module-level token table shared by the callbacks; starts empty and is
# replaced wholesale by load_csv() and save_table().
token_df = pd.DataFrame() # global store
# ───────────────────────── token explode ───────────────────────
def explode(df: pd.DataFrame) -> pd.DataFrame:
    """Split every example into whitespace tokens, one output row per token.

    Args:
        df: Input with either a ``text`` column, or both ``user`` and
            ``assistant`` columns. Dialog rows are joined as
            ``"User: <user> Assistant: <assistant>"`` before splitting.

    Returns:
        DataFrame with columns ``example_id`` (0-based position of the source
        row), ``token`` and ``label`` (always ``"O"``). The three columns are
        present even when the input yields no tokens.
    """
    if "text" in df.columns:
        lines = df["text"].astype(str).tolist()
    else:  # user / assistant dialogs
        # Zip the two columns rather than apply(axis=1): no per-row Series is
        # built, and an empty frame simply produces an empty list.
        lines = [
            f"User: {user} Assistant: {assistant}"
            for user, assistant in zip(df["user"], df["assistant"])
        ]

    rows = []
    for sid, line in enumerate(lines):  # ensure unique 0,1,2,...
        rows.extend(
            {"example_id": sid, "token": tok, "label": "O"}
            for tok in line.split()
        )
    # Explicit columns keep the schema stable when ``rows`` is empty, so
    # callers can always index ``example_id`` / ``token`` / ``label``.
    return pd.DataFrame(rows, columns=["example_id", "token", "label"])
# ───────────────────────── callbacks ───────────────────────────
def load_csv(file):
    """Read the uploaded CSV, tokenize it and publish the result to the UI.

    Replaces the module-level ``token_df`` with the exploded token table when
    the CSV is valid.

    Args:
        file: Value of the upload component: ``None`` when nothing was
            uploaded, otherwise an object exposing ``.name`` or a plain path
            string.

    Returns:
        A 5-tuple matching the outputs wired to the Load button: token table
        update, status message, button-row visibility, and the two download
        file updates. On any failure the table value is ``None`` and the
        three visibility updates hide their components.
    """
    global token_df
    hidden = gr.update(visible=False)

    if file is None:
        return None, "❌ Please upload a CSV file first.", hidden, hidden, hidden

    # Accept both a temp-file wrapper (``.name``) and a bare path string, so
    # the callback does not depend on how the File component is configured.
    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path)
    except (OSError, ValueError) as err:
        # pandas' EmptyDataError / ParserError and decode errors are all
        # ValueError subclasses; report them in the status box.
        return None, f"❌ Could not read CSV: {err}", hidden, hidden, hidden

    valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
    if not valid:
        msg = "❌ CSV needs a `text` column **or** both `user` and `assistant` columns."
        return None, msg, hidden, hidden, hidden

    token_df = explode(df)
    return (
        gr.update(value=token_df, visible=True, row_count=len(token_df)),
        f"✅ {len(df)} rows → {len(token_df)} tokens.",
        gr.update(visible=True),   # show buttons
        gr.update(visible=False),  # reset download links
        gr.update(visible=False),
    )
def save_table(tbl):
    """Store the edited table in the global ``token_df`` and validate labels.

    The table is saved even when unknown labels are present; the returned
    message then lists them so the user can correct and save again.

    Args:
        tbl: Current content of the token table component.

    Returns:
        A status string: a confirmation, or a warning naming every distinct
        label that is not in ``LABELS``.
    """
    global token_df
    token_df = pd.DataFrame(tbl, columns=["example_id", "token", "label"])
    unknown = token_df.loc[~token_df["label"].isin(LABELS), "label"].unique()
    if unknown.size == 0:
        return "💾 Saved."
    # Stringify each value: an unknown label may be NaN or a number (e.g. a
    # cleared cell), which str.join would reject with a TypeError.
    return f"⚠️ Unknown label(s): {', '.join(map(str, unknown))}"
def export_tokens():
    """Dump the global token table to ``raw_tokens.csv`` and expose it.

    Returns:
        A component update that points the download widget at the written
        file and makes it visible.
    """
    csv_path = "raw_tokens.csv"
    token_df.to_csv(csv_path, index=False)
    return gr.update(value=csv_path, visible=True)
def export_iob():
    """Write the global token table with an added ``iob`` column.

    Within one ``example_id``, a token whose label equals the label of the
    previous token gets ``I-<label>``; any other labelled token gets
    ``B-<label>``. ``"O"`` tokens stay ``"O"`` and end the running entity.
    Missing or empty labels are treated as ``"O"``.

    Returns:
        A component update that points the download widget at
        ``ner_iob.csv`` and makes it visible.
    """
    iob_tags = []
    prev = {}  # example_id -> label of that example's previous token (None after "O")
    # Iterate the two needed columns directly; iterrows() would build a
    # Series for every token.
    for sid, lbl in zip(token_df["example_id"], token_df["label"]):
        # A cleared cell arrives as NaN/None or ""; concatenating it onto
        # "B-" would raise or emit a bare "B-", so treat it as outside.
        tag = "" if pd.isna(lbl) else str(lbl)
        if tag in ("", "O"):
            iob_tags.append("O")
            prev[sid] = None
        else:
            prefix = "I-" if prev.get(sid) == tag else "B-"
            iob_tags.append(prefix + tag)
            prev[sid] = tag

    out = token_df.assign(iob=iob_tags)  # new frame; token_df is left untouched
    fname = "ner_iob.csv"
    out.to_csv(fname, index=False)
    return gr.update(value=fname, visible=True)
def push_to_hub(repo_id, token):
    """Upload the global token table as ``data.csv`` to a Hub dataset repo.

    The repo is created if it does not exist. The CSV is uploaded straight
    from memory through the HTTP API, so no local git clone is created and
    repeated pushes to the same repo do not trip over a leftover checkout.

    Args:
        repo_id: Target dataset repository, e.g. ``"user/my-dataset"``.
        token: Hugging Face access token with write permission.

    Returns:
        A status string: the dataset URL on success, or the error message
        prefixed with a cross mark on failure. Never raises.
    """
    try:
        api = HfApi()
        api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
        api.upload_file(
            path_or_fileobj=token_df.to_csv(index=False).encode("utf-8"),
            path_in_repo="data.csv",
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message="Add annotated NER data",
        )
        return f"🚀 https://huggingface.co/datasets/{repo_id}"
    except Exception as e:
        # UI boundary: surface any failure (auth, network, bad repo id) in
        # the output box instead of crashing the callback.
        return f"❌ {e}"
# ───────────────────────── UI ──────────────────────────────────
# Layout and event wiring for the annotation app. Components that start
# hidden are revealed by the Load button's callbacks.
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1** – upload CSV (columns: `text` **or** `user`+`assistant`).")

    with gr.Row():
        csv_file = gr.File(file_types=[".csv"])
        load_btn = gr.Button("Load")

    status = gr.Textbox(interactive=False)

    # Editable token table; hidden until a CSV has been loaded.
    tok_table = gr.Dataframe(
        headers=["example_id", "token", "label"],
        datatype=["number", "str", "str"],
        visible=False,
    )

    with gr.Row(visible=False) as buttons:
        save_btn = gr.Button("💾 Save")
        tok_btn = gr.Button("⬇︎ Tokens CSV")
        iob_btn = gr.Button("⬇︎ IOB CSV")

    # Download targets, shown by the export callbacks.
    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)

    with gr.Accordion("📦 Push to Hugging Face Hub", open=False, visible=False) as acc:
        repo_in, token_in = gr.Textbox(label="repo"), gr.Textbox(label="token", type="password")
        push_btn = gr.Button("Push")
        push_out = gr.Textbox(interactive=False)

    # wiring
    load_btn.click(load_csv, csv_file, [tok_table, status, buttons, file_tok, file_iob])
    load_btn.click(lambda: gr.update(visible=True), None, acc)
    save_btn.click(save_table, tok_table, status)
    tok_btn.click(export_tokens, outputs=file_tok)
    iob_btn.click(export_iob, outputs=file_iob)
    push_btn.click(push_to_hub, [repo_in, token_in], push_out)

    gr.Markdown("**Step 2** – label tokens (`PER`, `ORG`, `LOC`, `O`) ➜ Save ➜ Download / Push.")

demo.launch()