Spaces:
Runtime error
Runtime error
Commit
·
d1a92d2
1
Parent(s):
4700cbe
added KeyBERT
Browse files- app.py +95 -0
- reqirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from keybert import KeyBERT
|
2 |
+
import streamlit as st
|
3 |
+
import streamlit.components.v1 as components
|
4 |
+
from datasets import load_dataset
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
|
8 |
+
# ---------------------------------------------------------------------------
# Page header: app title plus attribution for this front end and for KeyBERT.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="KeyBERT")

st.title("HF-KeyBERT A front end for KeyBERT")
for credit in (
    "By Allen Roush",
    "github: https://github.com/Hellisotherpeople/CX_DB8",
    "Linkedin: https://www.linkedin.com/in/allen-roush-27721011b/",
):
    st.caption(credit)

st.header("KeyBERT")
st.caption("By Maarten Grootendorst")
st.image(
    "https://raw.githubusercontent.com/MaartenGr/KeyBERT/master/images/logo.png",
    width=200,
)
st.caption("github: https://github.com/MaartenGr")
st.caption("Linkedin: https://www.linkedin.com/in/mgrootendorst/")

# ---------------------------------------------------------------------------
# Sidebar form: pick the document source (an HF dataset or a pasted document)
# and the sentence-transformers checkpoint used for featurization.  The widget
# values below are module-level names read by the rest of the script.
# ---------------------------------------------------------------------------
form = st.sidebar.form("choose_settings")
form.header("Main Settings")

custom_doc = form.checkbox("Use a document from an existing dataset?")
if custom_doc:
    # Dataset mode: everything needed to locate and slice the dataset.
    dataset_name = form.text_area(
        "Enter the name of the huggingface Dataset to do analysis of:",
        value="Hellisotherpeople/DebateSum",
    )
    dataset_name_2 = form.text_area(
        "Enter the name of the config for the dataset if it has one",
        value="",
    )
    split_name = form.text_area(
        "Enter the name of the split of the dataset that you want to use",
        value="train",
    )
    number_of_records = form.number_input(
        "Enter the number of documents that you want to analyze from the dataset",
        value=200,
    )
    column_name = form.text_area(
        "Enter the name of the column that we are doing analysis on (the X value)",
        value="Full-Document",
    )
    index_to_analyze_start = form.number_input(
        "Enter the index start of the document that you want to analyze of the dataset",
        value=0,
    )
    index_to_analyze_end = form.number_input(
        "Enter the index end of the document that you want to analyze of the dataset",
        value=2,
    )
else:
    # Single-document mode: analyze one pasted document instead.
    doc = st.text_area("Enter a custom document")

model_name = form.text_area(
    "Enter the name of the pre-trained model from sentence transformers that we are using for featurization",
    value="all-MiniLM-L6-v2",
)
form.caption("This will download a new model, so it may take awhile or even break if the model is too large")
form.caption("See the list of pre-trained models that are available here! https://www.sbert.net/docs/pretrained_models.html")
form.form_submit_button("Submit")
|
41 |
+
|
42 |
+
|
43 |
+
@st.cache
def load_and_process_data(path, name, streaming, split_name, number_of_records, column_name=None):
    """Load the head of a Hugging Face dataset split and return one column.

    Parameters
    ----------
    path : str
        Dataset path/name forwarded to ``datasets.load_dataset``.
    name : str or None
        Dataset config name.  An empty string (the sidebar default) is
        treated as "no config" and mapped to ``None``.
    streaming : bool
        Forwarded to ``load_dataset`` so only the requested records are read.
    split_name : str
        Split to read from (e.g. ``"train"``).
    number_of_records : int
        Number of records taken from the head of the streaming split.
    column_name : str, optional
        Column to return.  Defaults to the module-level ``column_name``
        widget value, preserving the original behavior for existing callers;
        passing it explicitly also makes it part of the st.cache key.

    Returns
    -------
    pandas.Series
        The selected column of the first ``number_of_records`` rows.
    """
    # load_dataset rejects an empty config string; map "" to None.
    dataset = load_dataset(path=path, name=name or None, streaming=streaming)
    dataset_head = dataset[split_name].take(number_of_records)
    df = pd.DataFrame.from_dict(dataset_head)
    if column_name is None:
        # Previous version read the global directly inside the cached body
        # (so the cache key ignored it); keep that as the fallback default.
        column_name = globals()["column_name"]
    return df[column_name]
|
50 |
+
|
51 |
+
@st.cache(allow_output_mutation=True)
def load_model(model_name):
    """Build (and cache across reruns) a KeyBERT instance backed by the
    given sentence-transformers checkpoint name."""
    return KeyBERT(model=model_name)
|
55 |
+
|
56 |
+
# Featurization model; load_model's st.cache keeps it warm across reruns.
model = load_model(model_name=model_name)

if custom_doc:
    # Dataset mode: show the full loaded frame, then narrow ``doc`` to the
    # requested index window (a list of documents) and display it.
    st.header("Original Dataset")
    df = load_and_process_data(
        dataset_name, dataset_name_2, True, split_name, number_of_records
    )
    window = slice(index_to_analyze_start, index_to_analyze_end)
    doc = list(df[window])
    st.write(df)
    st.header("Indexed Documents")
    st.write(doc)
|
65 |
+
|
66 |
+
|
67 |
+
# ---------------------------------------------------------------------------
# Sidebar form: KeyBERT extraction settings (ngram range, MaxSum, MMR, etc.).
# ---------------------------------------------------------------------------
form2 = st.sidebar.form("KeyBERT Settings")
form2.header("KeyBERT Settings")
keyphrase_min = form2.number_input("KeyPhrase ngram range minimum", value=1, min_value=1)
keyphrase_max = form2.number_input("KeyPhrase ngram range maximum", value=2, min_value=1)
form2.caption("Use the keyphrase min and max to set the length of the resulting keywords/keyphrases")
use_maxsum = form2.checkbox("Max Sum Similarity?", value=False)
form2.caption("Max sum modifies the keyphrase algorithm in the following way: we take the 2 x top_n most similar words/phrases to the document. Then, we take all top_n combinations from the 2 x top_n words and extract the combination that are the least similar to each other by cosine similarity.")
nr_candidates = form2.number_input("Enter the number of candidates to consider if maxsum is True", value=10)
form2.caption("Only meaningful if Max Sum Similarity is selected")
use_mmr = form2.checkbox("Use Maximal Marginal Relevance?", value=False)
form2.caption("Maximal Marginal Relevance modifies the keyphrase algorithm in the following way: Instead of simply ranking the cosine similarity of the keyphrases to the document, keyphrases are also ranked against already selected keyphrases")
diversity = form2.number_input("Enter the diversity", value=0.7)
form2.caption("Diversity only is meaningful if Maximal Marginal Relevance is turned on. This modifies how much the MMR algorithm weighs the results")
top_n = form2.number_input("Enter the number of returned keyphrases", value=10)
# max(len(doc), 1) guards the empty-custom-document case: with a blank text
# area len(doc) == 0 and Streamlit rejects value=1 being above max_value=0.
# NOTE(review): in single-document mode len(doc) is a character count, not a
# document count — min_df is only meaningful with multiple documents anyway.
min_df = form2.number_input("Enter the minimum document frequency of a word", value=1, max_value=max(len(doc), 1))
form2.caption("Only meaningful if extracting the keyphrases of multiple documents")
seed_keywords = form2.text_area("Enter a list of keyword (separated with space) which will personalize/guide the extracted keywords", value="")
form2.caption("Due to the implementation details of this in KeyBERT, this doesn't usually heavily impact results")

form2.form_submit_button("Submit")
|
88 |
+
|
89 |
+
# Pass None (not an empty list) when the seed box is blank so KeyBERT takes
# its unguided code path instead of receiving an empty keyword list.
_seeds = seed_keywords.split() or None

# Run extraction over ``doc`` (a single string, or a list of documents in
# dataset mode) with the sidebar-selected KeyBERT settings.
keywords = model.extract_keywords(
    doc,
    keyphrase_ngram_range=(keyphrase_min, keyphrase_max),
    use_maxsum=use_maxsum,
    use_mmr=use_mmr,
    diversity=diversity,
    top_n=top_n,
    min_df=min_df,
    nr_candidates=nr_candidates,
    seed_keywords=_seeds,
)

st.header("Extracted Keywords/KeyPhrases")
st.caption("Output is sorted in reverse order (so the final element is the strongest keyphrase and the first element is the nth strongest)")
st.caption("That means you should read from the bottom up")
st.write(keywords)
|
95 |
+
|
reqirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
keybert
|
2 |
+
streamlit
|
3 |
+
pandas
|
4 |
+
datasets
|