Spaces:
Runtime error
Runtime error
File size: 5,404 Bytes
bbb171e cc99942 bbb171e cc99942 bbb171e cc99942 bbb171e 8ac6b3b 87ced4a 8ac6b3b bbb171e cc99942 bbb171e cc99942 bbb171e 0062690 bbb171e 2b49144 da00f10 cc99942 bbb171e 0062690 cc99942 bbb171e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import gradio as gr
from hashformers import TransformerWordSegmenter as WordSegmenter
import pandas as pd
# Footer HTML rendered below the interface (passed as ``article``).
article_string = (
    "Author: <a href=\"https://huggingface.co/ruanchaves\">Ruan Chaves Rodrigues</a>. "
    "Read more about the <a href=\"https://github.com/ruanchaves/hashformers\">Hashformers library</a>."
)

# Title shown at the top of the interface.
app_title = "Hashtag segmentation"

# Introductory text shown under the title (may contain HTML links).
app_description = """
Hashtag segmentation is the task of automatically adding spaces between the words on a hashtag.
This app uses the <a href=\"https://github.com/ruanchaves/hashformers\">Hashformers library</a> to suggest segmentations for hashtags.
Enter a hashtag or pick one from the examples below. The app will suggest the best segmentation for the hashtag.
In the advanced settings, decreasing the slider values will make the app faster, but it may also reduce its accuracy.
"""

# Clickable examples: each entry is [hashtag, language key].
app_examples = [
    ["#cristianoronaldo", "portuguese"],
    ["#madridsinfiltros", "spanish"],
    ["#kuenstlicheintelligenz", "german"],
    ["#dadscare", "english (fast)"],
    ["#nowthatcherisdead", "english"],
]

# Placeholder mapping; not referenced by the code visible in this file.
output_json_component_description = {"": ""}
# Language key -> (segmenter checkpoint, reranker checkpoint).
# The order here determines the order of the language drop-down.
_MODEL_CHECKPOINTS = {
    "english": ("gpt2", "bert-base-uncased"),
    "english (fast)": ("distilgpt2", "distilbert-base-uncased"),
    "spanish": (
        "mrm8488/spanish-gpt2",
        "dccuchile/bert-base-spanish-wwm-cased",
    ),
    "portuguese": (
        "pierreguillou/gpt2-small-portuguese",
        "neuralmind/bert-base-portuguese-cased",
    ),
    "german": ("dbmdz/german-gpt2", "bert-base-german-cased"),
}

# One WordSegmenter per language, all built eagerly at import time with
# the segmenter placed on the CPU.
model_dict = {
    language_key: WordSegmenter(
        segmenter_model_name_or_path=segmenter_checkpoint,
        reranker_model_name_or_path=reranker_checkpoint,
        segmenter_device="cpu",
    )
    for language_key, (segmenter_checkpoint, reranker_checkpoint)
    in _MODEL_CHECKPOINTS.items()
}

# Language keys offered in the UI, in insertion order.
language_list = list(model_dict)
def format_dataframe(df):
    """Return a display-ready copy of a ranking table.

    Keeps only the ``segmentation`` and ``score`` columns and replaces
    each score with its reciprocal rounded to 4 decimal places.

    Args:
        df: A ``pandas.DataFrame`` containing ``segmentation`` and
            ``score`` columns, or any other object.

    Returns:
        A new two-column DataFrame, or ``df`` itself (unchanged) when it
        is not a DataFrame.

    Raises:
        KeyError: If a DataFrame is missing one of the two columns.

    Note:
        Scores are assumed to be non-zero, since each one is inverted.
    """
    if not isinstance(df, pd.DataFrame):
        return df
    # Take an explicit copy: assigning a column on the bare result of a
    # column selection is the chained-assignment pattern pandas warns
    # about, and the copy guarantees the caller's frame is never touched.
    formatted = df[["segmentation", "score"]].copy()
    formatted["score"] = formatted["score"].apply(lambda x: round(1 / x, 4))
    return formatted
def convert_to_score_dict(df):
    """Build a ``{segmentation: score}`` mapping from a ranking table.

    Args:
        df: A ``pandas.DataFrame`` with ``segmentation`` and ``score``
            columns, or any other object.

    Returns:
        A dict keyed by segmentation string; an empty dict when ``df`` is
        not a DataFrame.
    """
    if isinstance(df, pd.DataFrame):
        pairs = df[["segmentation", "score"]]
        # After indexing by segmentation only the score column is left;
        # transposing turns it into a single row keyed by segmentation.
        single_row = pairs.set_index("segmentation").T
        records = single_row.to_dict("records")
        return records[0]
    return {}
def get_candidates_df(candidates, segmenter_score_dict, reranker_score_dict):
    """Tabulate candidates alongside their segmenter and reranker scores.

    Args:
        candidates: Iterable of segmentation strings.
        segmenter_score_dict: Mapping from segmentation to segmenter score.
        reranker_score_dict: Mapping from segmentation to reranker score.

    Returns:
        A ``pandas.DataFrame`` with one row per candidate and the columns
        ``segmentation``, ``segmenter score`` and ``reranker score``.
        A candidate missing from a mapping gets a score of 0.
    """
    rows = [
        {
            "segmentation": text,
            "segmenter score": segmenter_score_dict.get(text, 0),
            "reranker score": reranker_score_dict.get(text, 0),
        }
        for text in candidates
    ]
    return pd.DataFrame(rows)
def parse_candidates(candidates):
    """Split a comma-separated string into whitespace-trimmed items.

    Args:
        candidates: A comma-separated string, or a falsy value.

    Returns:
        A list of trimmed items. Falsy input (``None`` or ``""``) gives an
        empty list; empty items between commas are kept as ``""``.
    """
    if candidates:
        return [piece.strip() for piece in candidates.split(",")]
    return []
def predict(s1, language, use_reranker, topk, steps):
    """Segment one hashtag and tabulate the best-scoring alternatives.

    Args:
        s1: The hashtag text to segment.
        language: Key into ``model_dict``; a falsy value selects
            ``"english (fast)"``.
        use_reranker: Whether to rerank candidates. Also selects which
            ranking and which score column drive the output table.
        topk: Forwarded to the model as ``topk`` (the "number of beams"
            slider in the UI).
        steps: Forwarded to the model as ``steps`` (the "maximum number of
            spaces" slider in the UI).

    Returns:
        A ``(markdown, dataframe)`` tuple: the top segmentation formatted
        as a bold Markdown heading, and a de-duplicated table of the top
        segmentation plus up to three ranked candidates, sorted by the
        active score in descending order. Returns ``(None, None)`` when
        ``topk`` or ``steps`` is falsy.
    """
    model_key = language if language else "english (fast)"
    segmenter = model_dict[model_key]

    # A zero/empty value for either setting means there is nothing to do.
    if not (topk and steps):
        return None, None

    # The result object is expected to expose ``segmenter_rank``,
    # ``reranker_rank`` and ``output``.
    result = segmenter.segment(
        [s1],
        use_reranker=use_reranker,
        return_ranks=True,
        topk=topk,
        steps=steps,
    )
    segmenter_df = format_dataframe(result.segmenter_rank)
    reranker_df = format_dataframe(result.reranker_rank)

    # Take the three leading rows from whichever ranking is active.
    active_ranking = reranker_df if use_reranker else segmenter_df
    candidates_list = active_ranking.head(3)["segmentation"].tolist()

    best = result.output[0]
    segmenter_scores = convert_to_score_dict(segmenter_df)
    reranker_scores = convert_to_score_dict(reranker_df)

    frames = [
        get_candidates_df([best], segmenter_scores, reranker_scores),
        get_candidates_df(candidates_list, segmenter_scores, reranker_scores),
    ]
    output_df = pd.concat(frames, axis=0)

    sort_column = "reranker score" if use_reranker else "segmenter score"
    output_df = output_df.sort_values(by=sort_column, ascending=False)
    # The top segmentation usually also appears among the candidates.
    output_df = output_df.drop_duplicates(subset="segmentation", keep="first")
    return "# **{0}**".format(best), output_df
# Input widgets, in the positional order of ``predict``'s parameters:
# (s1, language, use_reranker, topk, steps).
inputs = [
    gr.Textbox(label="Hashtag"),
    gr.Dropdown(language_list, label="Language", value="english (fast)"),
    gr.Checkbox(label="Use reranker", value=False),
    gr.Slider(0, 100, value=20, label="Advanced setting - Beamsearch: Number of beams"),
    gr.Slider(0, 100, value=13, label="Advanced setting - Maximum number of spaces allowed"),
]

# Output widgets, matching the ``(markdown, dataframe)`` pair that
# ``predict`` returns.
outputs = [
    gr.Markdown(label="Suggested segmentation"),
    gr.DataFrame(label="Top alternatives"),
]

# Build the interface and start serving it as soon as the module runs.
gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=outputs,
    title=app_title,
    description=app_description,
    examples=app_examples,
    article=article_string,
).launch()