Spaces:
Runtime error
Runtime error
Commit
·
d1a92d2
1
Parent(s):
4700cbe
added KeyBERT
Browse files- app.py +95 -0
- requirements.txt +4 -0
app.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from keybert import KeyBERT
import streamlit as st
import streamlit.components.v1 as components
from datasets import load_dataset
import pandas as pd

# HF-KeyBERT: a Streamlit front end for KeyBERT keyword/keyphrase extraction.
# Documents come either from a streamed Hugging Face dataset or from a custom
# text box; extraction settings are collected in two sidebar forms.

st.set_page_config(page_title="KeyBERT")

st.title("HF-KeyBERT A front end for KeyBERT")
st.caption("By Allen Roush")
st.caption("github: https://github.com/Hellisotherpeople/CX_DB8")
st.caption("Linkedin: https://www.linkedin.com/in/allen-roush-27721011b/")
st.header("KeyBERT")
st.caption("By Maarten Grootendorst")
st.image("https://raw.githubusercontent.com/MaartenGr/KeyBERT/master/images/logo.png", width=200)
st.caption("github: https://github.com/MaartenGr")
st.caption("Linkedin: https://www.linkedin.com/in/mgrootendorst/")


# --- Main settings sidebar form -------------------------------------------
form = st.sidebar.form("choose_settings")
form.header("Main Settings")
custom_doc = form.checkbox("Use a document from an existing dataset?")
if custom_doc:
    dataset_name = form.text_area("Enter the name of the huggingface Dataset to do analysis of:", value="Hellisotherpeople/DebateSum")
    dataset_name_2 = form.text_area("Enter the name of the config for the dataset if it has one", value="")
    split_name = form.text_area("Enter the name of the split of the dataset that you want to use", value="train")
    number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value=200)
    column_name = form.text_area("Enter the name of the column that we are doing analysis on (the X value)", value="Full-Document")
    index_to_analyze_start = form.number_input("Enter the index start of the document that you want to analyze of the dataset", value=0)
    index_to_analyze_end = form.number_input("Enter the index end of the document that you want to analyze of the dataset", value=2)
else:
    doc = st.text_area("Enter a custom document")

model_name = form.text_area("Enter the name of the pre-trained model from sentence transformers that we are using for featurization", value="all-MiniLM-L6-v2")
form.caption("This will download a new model, so it may take awhile or even break if the model is too large")
form.caption("See the list of pre-trained models that are available here! https://www.sbert.net/docs/pretrained_models.html")
form.form_submit_button("Submit")


@st.cache
def load_and_process_data(path, name, streaming, split_name, number_of_records, column_name):
    """Stream the head of a Hugging Face dataset split and return one column.

    Parameters mirror ``datasets.load_dataset`` plus the split, the number of
    records to take, and the column to extract. ``column_name`` is a real
    parameter here (the original read it from a global inside the cached
    function), so the st.cache key now invalidates when the user changes it.

    Returns a pandas Series with ``number_of_records`` entries.
    """
    dataset = load_dataset(path=path, name=name, streaming=streaming)
    # Streaming avoids downloading the whole dataset; take() yields only the head.
    dataset_head = dataset[split_name].take(number_of_records)
    df = pd.DataFrame.from_dict(dataset_head)
    return df[column_name]


@st.cache(allow_output_mutation=True)
def load_model(model_name):
    """Load (and cache) a KeyBERT instance backed by a sentence-transformers model."""
    kw_model = KeyBERT(model=model_name)
    return kw_model


model = load_model(model_name=model_name)

if custom_doc:
    st.header("Original Dataset")
    df = load_and_process_data(dataset_name, dataset_name_2, True, split_name, number_of_records, column_name)
    # Slice out only the documents the user asked to analyze.
    doc = list(df[index_to_analyze_start:index_to_analyze_end])
    st.write(df)
    st.header("Indexed Documents")
    st.write(doc)


# --- KeyBERT settings sidebar form ----------------------------------------
form2 = st.sidebar.form("KeyBERT Settings")
form2.header("KeyBERT Settings")
keyphrase_min = form2.number_input("KeyPhrase ngram range minimum", value=1, min_value=1)
keyphrase_max = form2.number_input("KeyPhrase ngram range maximum", value=2, min_value=1)
form2.caption("Use the keyphrase min and max to set the length of the resulting keywords/keyphrases")
use_maxsum = form2.checkbox("Max Sum Similarity?", value=False)
form2.caption("Max sum modifies the keyphrase algorithim in the following way: we take the 2 x top_n most similar words/phrases to the document. Then, we take all top_n combinations from the 2 x top_n words and extract the combination that are the least similar to each other by cosine similarity.")
nr_candidates = form2.number_input("Enter the number of candidates to consider if maxsum is True", value=10)
form2.caption("Only meaningful if Max Sum Similarity is selected")
use_mmr = form2.checkbox("Use Maximal Marginal Relevance?", value=False)
form2.caption("Maximal Marginal Relevance modifies the keyphrase algorithim in the following way: Instead of simply ranking the cosine similarity of the keyphrases to the document, keyphrases are also ranked against already selected keyphrases")
diversity = form2.number_input("Enter the diversity", value=0.7)
form2.caption("Diversity only is meaningful if Maximal Marginal Relevance is turned on. This modifies how much the MMR algorithim weighs the results")
top_n = form2.number_input("Enter the number of returned keyphrases", value=10)
# max(1, len(doc)): the original passed max_value=len(doc) directly, which
# raises a StreamlitAPIException when the custom document is still empty
# (value=1 > max_value=0) and crashes the app on first load.
min_df = form2.number_input("Enter the minimum document frequency of a word", value=1, max_value=max(1, len(doc)))
form2.caption("Only meaningful if extracting the keyphrases of multiple documents")
seed_keywords = form2.text_area("Enter a list of keyword (seperated with space) which will personalize/guide the extracted keywords", value="")
form2.caption("Due to the implementation details of this in KeyBERT, this doesn't usually heavily impact results")

form2.form_submit_button("Submit")

keywords = model.extract_keywords(
    doc,
    keyphrase_ngram_range=(keyphrase_min, keyphrase_max),
    use_maxsum=use_maxsum,
    use_mmr=use_mmr,
    diversity=diversity,
    top_n=top_n,
    min_df=min_df,
    nr_candidates=nr_candidates,
    # KeyBERT documents None (not an empty list) as "no seeding"; a blank
    # text box previously produced [] via .split().
    seed_keywords=seed_keywords.split() or None,
)

st.header("Extracted Keywords/KeyPhrases")
st.caption("Output is sorted in reverse order (so the final element is the strongest keyphrase and the first element is the nth strongest")
st.caption("That means you should read from the bottom up")
st.write(keywords)
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
keybert
|
| 2 |
+
streamlit
|
| 3 |
+
pandas
|
| 4 |
+
datasets
|