Paula Leonova committed
Commit: 51fcc5c
1 Parent(s): e452a5c

Update spinners and sentence chunking

app.py CHANGED
@@ -32,6 +32,11 @@ with st.form(key='my_form'):
     if text_input == display_text:
         text_input = example_text
 
+    gen_keywords = st.radio(
+        "Generate keywords from text?",
+        ('Yes', 'No')
+        )
+
     labels = st.text_input('Enter possible labels (comma-separated):',ex_labels, max_chars=1000)
     labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
 
@@ -45,51 +50,76 @@ with st.form(key='my_form'):
     submit_button = st.form_submit_button(label='Submit')
 
 
-
+
+with st.spinner('Loading pretrained summarizer and classifier mnli model...'):
     start = time.time()
     summarizer = md.load_summary_model()
-
+    s_time = round(time.time() - start,4)
 
-with st.spinner('Loading pretrained classifier mnli model...'):
     start = time.time()
-    classifier = md.load_model()
-    st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
+    classifier = md.load_model()
+    c_time = round(time.time() - start,4)
+
+    st.success(f'Time taken to load: summarizer mnli model {s_time}s & classifier mnli model {c_time}s')
+
+# with st.spinner('Loading pretrained classifier mnli model...'):
+#     start = time.time()
+#     classifier = md.load_model()
+#     st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
 
 
 if submit_button:
-    if len(
-        st.write(
-
-
-        my_expander = st.expander(label='Expand to see summary generation details')
-        with my_expander:
+    if len(text_input) == 0:
+        st.write("Enter some text to generate a summary")
+    else:
+        with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
             # For each body of text, create text chunks of a certain token size required for the transformer
             nested_sentences = md.create_nest_sentences(document = text_input, token_max_length = 1024)
-
-
-
-
-
-
-
-
-
-
-
-            st.markdown(
-
-
-            summary.
-
-
-            #
-
-
-
-
-
-
-
+            # For each chunk of sentences (within the token max)
+            text_chunks = []
+            for n in range(0, len(nested_sentences)):
+                tc = " ".join(map(str, nested_sentences[n]))
+                text_chunks.append(tc)
+
+        with st.spinner('Generating summaries for text chunks...'):
+
+            my_expander = st.expander(label='Expand to see summary generation details')
+            with my_expander:
+                summary = []
+                st.markdown("### Text Chunk & Summaries")
+                # st.markdown("_Breaks up the original text into sections with complete sentences totaling \
+                # less than 1024 tokens, a requirement for the summarizer. Each block of text is than summarized separately \
+                # and then combined at the very end to generate the final summary._")
+
+                # # For each chunk of sentences (within the token max), generate a summary
+                # for n in range(0, len(nested_sentences)):
+                #     text_chunk = " ".join(map(str, nested_sentences[n]))
+                #     st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
+                #     st.markdown(text_chunk)
+
+                for num_chunk, text_chunk in enumerate(text_chunks):
+                    st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
+                    st.markdown(text_chunk)
+
+                    chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
+                    summary.append(chunk_summary)
+                    st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
+                    st.markdown(chunk_summary)
+                # Combine all the summaries into a list and compress into one document, again
+                final_summary = " \n\n".join(list(summary))
+
+                # final_summary = md.summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
+                st.markdown("### Combined Summary")
+                st.markdown(final_summary)
+
+        # if gen_keywords == 'Yes':
+        #     st.markdown("### Top Keywords")
+        #     with st.spinner("Generating keywords from text...")
+        #         keywords =
+
+    if len(text_input) == 0 or len(labels) == 0:
+        st.write('Enter some text and at least one possible topic to see predictions.')
+    else:
         st.markdown("### Top Label Predictions on Summary & Full Text")
         with st.spinner('Matching labels...'):
             topics, scores = md.classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
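Note on the hunk above: md.create_nest_sentences and md.summarizer_gen live in the app's companion models module and are not part of this commit. As a rough reference only, here is a minimal, self-contained sketch of what the chunking helper plausibly does, assuming it packs whole sentences into groups that stay under token_max_length; the regex sentence splitter and whitespace token count below are stand-in assumptions for the app's real sentence splitter and transformer tokenizer.

import re

def create_nest_sentences(document, token_max_length=1024):
    # Group whole sentences into "nests" whose combined length stays
    # under the token budget required by the transformer.
    nested, current, length = [], [], 0
    sentences = re.split(r'(?<=[.!?])\s+', document.replace("\n", " "))
    for sentence in sentences:
        n_tokens = len(sentence.split())  # crude stand-in for a tokenizer count
        if current and length + n_tokens > token_max_length:
            nested.append(current)
            current, length = [], 0
        current.append(sentence)
        length += n_tokens
    if current:
        nested.append(current)
    return nested

# Nests are then joined back into plain-text chunks, as in the hunk above:
# text_chunks = [" ".join(map(str, nest)) for nest in create_nest_sentences(text_input)]
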
@@ -146,7 +176,7 @@ if submit_button:
         section_header_description = ['Summary Label Performance', 'Original Full Text Label Performance']
         data_headers = ['scores_from_summary', 'scores_from_full_text']
         for i in range(0,2):
-            st.markdown(f"
+            st.markdown(f"###### {section_header_description[i]}")
             report = classification_report(y_true = data2[['is_true_label']],
                                            y_pred = (data2[[data_headers[i]]] >= threshold_value) * 1.0,
                                            output_dict=True)
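For context on this hunk: the scores columns hold continuous label probabilities, so they are binarized against threshold_value before being scored with sklearn's classification_report. A runnable sketch of the same pattern follows; the column names mirror the app's, but the data values and threshold are made up for illustration.

import pandas as pd
from sklearn.metrics import classification_report

# Illustrative data only: one true-label column and one score column.
data2 = pd.DataFrame({
    'is_true_label':       [1.0, 0.0, 1.0, 0.0],
    'scores_from_summary': [0.91, 0.12, 0.45, 0.78],
})
threshold_value = 0.5

# Scores >= threshold become 1.0, everything else 0.0, as in the diff.
report = classification_report(y_true=data2[['is_true_label']],
                               y_pred=(data2[['scores_from_summary']] >= threshold_value) * 1.0,
                               output_dict=True)
df_report = pd.DataFrame(report).transpose()
print(df_report)
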
@@ -154,5 +184,5 @@ if submit_button:
         st.markdown(f"Threshold set for: {threshold_value}")
         st.dataframe(df_report)
 
-
-
+    st.success('All done!')
+    st.balloons()
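A last reference note: the classifier loaded by md.load_model is also not shown in this commit; given the "mnli" naming, it is presumably a zero-shot NLI model. Below is a minimal sketch of an equivalent classifier_zero built on the transformers zero-shot pipeline, assuming a facebook/bart-large-mnli checkpoint. Recent transformers versions name the flag multi_label; the multi_class keyword seen in the diff belongs to the app's own wrapper.

from transformers import pipeline

# Assumed checkpoint; the app's actual model choice is not in this commit.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def classifier_zero(classifier, sequence, labels, multi_class=True):
    # multi_label=True scores each candidate label independently instead
    # of normalizing the scores across all labels.
    outputs = classifier(sequence, candidate_labels=labels, multi_label=multi_class)
    return outputs['labels'], outputs['scores']

topics, scores = classifier_zero(classifier,
                                 sequence="The market rallied after the earnings call.",
                                 labels=["finance", "sports", "politics"])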