Spaces:
Runtime error
Runtime error
Commit
·
d1a92d2
1
Parent(s):
4700cbe
added KeyBERT
Browse files- app.py +95 -0
- requirements.txt +4 -0
app.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from keybert import KeyBERT
import streamlit as st
import streamlit.components.v1 as components
from datasets import load_dataset
import pandas as pd

# HF-KeyBERT: a Streamlit front end for KeyBERT keyword/keyphrase extraction.
# Documents come either from a streamed Hugging Face dataset or from a custom
# text box; extraction settings are collected in two sidebar forms.

st.set_page_config(page_title="KeyBERT")

st.title("HF-KeyBERT A front end for KeyBERT")
st.caption("By Allen Roush")
st.caption("github: https://github.com/Hellisotherpeople/CX_DB8")
st.caption("Linkedin: https://www.linkedin.com/in/allen-roush-27721011b/")
st.header("KeyBERT")
st.caption("By Maarten Grootendorst")
st.image("https://raw.githubusercontent.com/MaartenGr/KeyBERT/master/images/logo.png", width=200)
st.caption("github: https://github.com/MaartenGr")
st.caption("Linkedin: https://www.linkedin.com/in/mgrootendorst/")


# --- Main settings sidebar form -------------------------------------------
form = st.sidebar.form("choose_settings")
form.header("Main Settings")
custom_doc = form.checkbox("Use a document from an existing dataset?")
if custom_doc:
    dataset_name = form.text_area("Enter the name of the huggingface Dataset to do analysis of:", value="Hellisotherpeople/DebateSum")
    dataset_name_2 = form.text_area("Enter the name of the config for the dataset if it has one", value="")
    split_name = form.text_area("Enter the name of the split of the dataset that you want to use", value="train")
    number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value=200)
    column_name = form.text_area("Enter the name of the column that we are doing analysis on (the X value)", value="Full-Document")
    index_to_analyze_start = form.number_input("Enter the index start of the document that you want to analyze of the dataset", value=0)
    index_to_analyze_end = form.number_input("Enter the index end of the document that you want to analyze of the dataset", value=2)
else:
    doc = st.text_area("Enter a custom document")

model_name = form.text_area("Enter the name of the pre-trained model from sentence transformers that we are using for featurization", value="all-MiniLM-L6-v2")
form.caption("This will download a new model, so it may take awhile or even break if the model is too large")
form.caption("See the list of pre-trained models that are available here! https://www.sbert.net/docs/pretrained_models.html")
form.form_submit_button("Submit")


@st.cache
def load_and_process_data(path, name, streaming, split_name, number_of_records, column_name):
    """Stream the head of a Hugging Face dataset split and return one column.

    Parameters mirror ``datasets.load_dataset`` plus the split, the number of
    records to take, and the column to extract. ``column_name`` is a real
    parameter here (the original read it from a global inside the cached
    function), so the st.cache key now invalidates when the user changes it.

    Returns a pandas Series with ``number_of_records`` entries.
    """
    dataset = load_dataset(path=path, name=name, streaming=streaming)
    # Streaming avoids downloading the whole dataset; take() yields only the head.
    dataset_head = dataset[split_name].take(number_of_records)
    df = pd.DataFrame.from_dict(dataset_head)
    return df[column_name]


@st.cache(allow_output_mutation=True)
def load_model(model_name):
    """Load (and cache) a KeyBERT instance backed by a sentence-transformers model."""
    kw_model = KeyBERT(model=model_name)
    return kw_model


model = load_model(model_name=model_name)

if custom_doc:
    st.header("Original Dataset")
    df = load_and_process_data(dataset_name, dataset_name_2, True, split_name, number_of_records, column_name)
    # Slice out only the documents the user asked to analyze.
    doc = list(df[index_to_analyze_start:index_to_analyze_end])
    st.write(df)
    st.header("Indexed Documents")
    st.write(doc)


# --- KeyBERT settings sidebar form ----------------------------------------
form2 = st.sidebar.form("KeyBERT Settings")
form2.header("KeyBERT Settings")
keyphrase_min = form2.number_input("KeyPhrase ngram range minimum", value=1, min_value=1)
keyphrase_max = form2.number_input("KeyPhrase ngram range maximum", value=2, min_value=1)
form2.caption("Use the keyphrase min and max to set the length of the resulting keywords/keyphrases")
use_maxsum = form2.checkbox("Max Sum Similarity?", value=False)
form2.caption("Max sum modifies the keyphrase algorithim in the following way: we take the 2 x top_n most similar words/phrases to the document. Then, we take all top_n combinations from the 2 x top_n words and extract the combination that are the least similar to each other by cosine similarity.")
nr_candidates = form2.number_input("Enter the number of candidates to consider if maxsum is True", value=10)
form2.caption("Only meaningful if Max Sum Similarity is selected")
use_mmr = form2.checkbox("Use Maximal Marginal Relevance?", value=False)
form2.caption("Maximal Marginal Relevance modifies the keyphrase algorithim in the following way: Instead of simply ranking the cosine similarity of the keyphrases to the document, keyphrases are also ranked against already selected keyphrases")
diversity = form2.number_input("Enter the diversity", value=0.7)
form2.caption("Diversity only is meaningful if Maximal Marginal Relevance is turned on. This modifies how much the MMR algorithim weighs the results")
top_n = form2.number_input("Enter the number of returned keyphrases", value=10)
# max(1, len(doc)): the original passed max_value=len(doc) directly, which
# raises a StreamlitAPIException when the custom document is still empty
# (value=1 > max_value=0) and crashes the app on first load.
min_df = form2.number_input("Enter the minimum document frequency of a word", value=1, max_value=max(1, len(doc)))
form2.caption("Only meaningful if extracting the keyphrases of multiple documents")
seed_keywords = form2.text_area("Enter a list of keyword (seperated with space) which will personalize/guide the extracted keywords", value="")
form2.caption("Due to the implementation details of this in KeyBERT, this doesn't usually heavily impact results")

form2.form_submit_button("Submit")

keywords = model.extract_keywords(
    doc,
    keyphrase_ngram_range=(keyphrase_min, keyphrase_max),
    use_maxsum=use_maxsum,
    use_mmr=use_mmr,
    diversity=diversity,
    top_n=top_n,
    min_df=min_df,
    nr_candidates=nr_candidates,
    # KeyBERT documents None (not an empty list) as "no seeding"; a blank
    # text box previously produced [] via .split().
    seed_keywords=seed_keywords.split() or None,
)

st.header("Extracted Keywords/KeyPhrases")
st.caption("Output is sorted in reverse order (so the final element is the strongest keyphrase and the first element is the nth strongest")
st.caption("That means you should read from the bottom up")
st.write(keywords)
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
keybert
|
| 2 |
+
streamlit
|
| 3 |
+
pandas
|
| 4 |
+
datasets
|