Rainsilves committed
Commit d1a92d2 · 1 Parent(s): 4700cbe

added KeyBERT

Files changed (2)
  1. app.py +95 -0
  2. reqirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,95 @@
+ from keybert import KeyBERT
+ import streamlit as st
+ import streamlit.components.v1 as components
+ from datasets import load_dataset
+ import pandas as pd
+ 
+ # Page setup and attribution
+ st.set_page_config(page_title="KeyBERT")
+ 
+ st.title("HF-KeyBERT: A front end for KeyBERT")
+ st.caption("By Allen Roush")
+ st.caption("GitHub: https://github.com/Hellisotherpeople/CX_DB8")
+ st.caption("LinkedIn: https://www.linkedin.com/in/allen-roush-27721011b/")
+ st.header("KeyBERT")
+ st.caption("By Maarten Grootendorst")
+ st.image("https://raw.githubusercontent.com/MaartenGr/KeyBERT/master/images/logo.png", width=200)
+ st.caption("GitHub: https://github.com/MaartenGr")
+ st.caption("LinkedIn: https://www.linkedin.com/in/mgrootendorst/")
+ 
+ 
+ # Sidebar form: choose the input document(s) and the embedding model
+ form = st.sidebar.form("choose_settings")
+ 
+ form.header("Main Settings")
+ custom_doc = form.checkbox("Use a document from an existing dataset?")
+ if custom_doc:
+     dataset_name = form.text_area("Enter the name of the Hugging Face dataset to analyze:", value="Hellisotherpeople/DebateSum")
+     dataset_name_2 = form.text_area("Enter the name of the config for the dataset, if it has one", value="")
+     split_name = form.text_area("Enter the name of the dataset split that you want to use", value="train")
+     number_of_records = form.number_input("Enter the number of documents to analyze from the dataset", value=200)
+     column_name = form.text_area("Enter the name of the column to analyze (the X value)", value="Full-Document")
+     index_to_analyze_start = form.number_input("Enter the start index of the documents to analyze", value=0)
+     index_to_analyze_end = form.number_input("Enter the end index of the documents to analyze", value=2)
+ else:
+     doc = st.text_area("Enter a custom document")
+ 
+ model_name = form.text_area("Enter the name of the pre-trained sentence-transformers model to use for featurization", value="all-MiniLM-L6-v2")
+ form.caption("This will download a new model, so it may take a while, or even break if the model is too large")
+ form.caption("See the list of available pre-trained models here: https://www.sbert.net/docs/pretrained_models.html")
+ form.form_submit_button("Submit")
+ 
+ # Cache dataset loading so Streamlit reruns don't re-stream the data
+ @st.cache
+ def load_and_process_data(path, name, streaming, split_name, number_of_records, column_name):
+     dataset = load_dataset(path=path, name=name, streaming=streaming)
+     dataset_head = dataset[split_name].take(number_of_records)
+     df = pd.DataFrame.from_dict(dataset_head)
+     return df[column_name]
+ 
+ # Cache the KeyBERT model so it is only downloaded and initialized once
+ @st.cache(allow_output_mutation=True)
+ def load_model(model_name):
+     kw_model = KeyBERT(model=model_name)
+     return kw_model
+ 
+ model = load_model(model_name=model_name)
+ 
+ if custom_doc:
+     st.header("Original Dataset")
+     df = load_and_process_data(dataset_name, dataset_name_2 or None, True, split_name, number_of_records, column_name)
+     doc = list(df[index_to_analyze_start:index_to_analyze_end])
+     st.write(df)
+     st.header("Indexed Documents")
+     st.write(doc)
+ 
+ 
+ form2 = st.sidebar.form("KeyBERT Settings")
+ form2.header("KeyBERT Settings")
+ keyphrase_min = form2.number_input("KeyPhrase ngram range minimum", value=1, min_value=1)
+ keyphrase_max = form2.number_input("KeyPhrase ngram range maximum", value=2, min_value=1)
+ form2.caption("Use the keyphrase min and max to set the length of the resulting keywords/keyphrases")
+ use_maxsum = form2.checkbox("Max Sum Similarity?", value=False)
+ form2.caption("Max Sum modifies the keyphrase algorithm in the following way: we take the 2 x top_n words/phrases most similar to the document, then consider all top_n combinations of them and extract the combination whose members are least similar to each other by cosine similarity.")
+ nr_candidates = form2.number_input("Enter the number of candidates to consider if Max Sum is enabled", value=10)
+ form2.caption("Only meaningful if Max Sum Similarity is selected")
+ use_mmr = form2.checkbox("Use Maximal Marginal Relevance?", value=False)
+ form2.caption("Maximal Marginal Relevance modifies the keyphrase algorithm in the following way: instead of simply ranking keyphrases by their cosine similarity to the document, keyphrases are also ranked against the keyphrases already selected")
+ diversity = form2.number_input("Enter the diversity", value=0.7)
+ form2.caption("Diversity is only meaningful if Maximal Marginal Relevance is turned on. It controls how heavily the MMR algorithm weighs diversity against relevance")
+ top_n = form2.number_input("Enter the number of returned keyphrases", value=10)
+ min_df = form2.number_input("Enter the minimum document frequency of a word", value=1, max_value=len(doc))
+ form2.caption("Only meaningful when extracting keyphrases from multiple documents")
+ seed_keywords = form2.text_area("Enter a list of keywords (separated by spaces) to personalize/guide the extracted keywords", value="")
+ form2.caption("Due to the implementation details of this in KeyBERT, it usually doesn't heavily impact the results")
+ 
+ 
+ form2.form_submit_button("Submit")
+ 
+ keywords = model.extract_keywords(doc, keyphrase_ngram_range=(keyphrase_min, keyphrase_max), use_maxsum=use_maxsum, use_mmr=use_mmr, diversity=diversity, top_n=top_n, min_df=min_df, nr_candidates=nr_candidates, seed_keywords=seed_keywords.split() or None)
+ 
+ st.header("Extracted Keywords/KeyPhrases")
+ st.caption("Output is sorted in reverse order (the final element is the strongest keyphrase and the first element is the nth strongest)")
+ st.caption("That means you should read from the bottom up")
+ st.write(keywords)
+ 
reqirements.txt ADDED
@@ -0,0 +1,4 @@
+ keybert
+ streamlit
+ pandas
+ datasets
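
Note: the app is a thin front end over a single KeyBERT call, model.extract_keywords. A minimal sketch of the same pipeline outside Streamlit, using the default model and the MMR option the sidebar exposes (the sample document text below is made up for illustration):

from keybert import KeyBERT

# Same default sentence-transformers backbone the app offers
kw_model = KeyBERT(model="all-MiniLM-L6-v2")

doc = "KeyBERT extracts keyphrases by comparing candidate phrase embeddings to the document embedding."

# Mirrors the sidebar defaults: 1-2 word phrases, MMR re-ranking with diversity 0.7
keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 2),
    use_mmr=True,
    diversity=0.7,
    top_n=5,
)
print(keywords)  # list of (phrase, similarity score) tuples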