Paula Leonova committed
Commit: 51fcc5c
Parent(s): e452a5c

Update spinners and sentence chunking
app.py CHANGED
@@ -32,6 +32,11 @@ with st.form(key='my_form'):
     if text_input == display_text:
         text_input = example_text
 
+    gen_keywords = st.radio(
+        "Generate keywords from text?",
+        ('Yes', 'No')
+        )
+
     labels = st.text_input('Enter possible labels (comma-separated):',ex_labels, max_chars=1000)
     labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
 
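For context on the hunk above: widgets declared inside st.form hand their values to the script only when the form's submit button is pressed, so the new gen_keywords radio does not rerun the app on every click. A minimal sketch of that pattern (ex_labels below is a stand-in, not the app's real default):

```python
import streamlit as st

# Sketch only: ex_labels stands in for the app's actual default label string.
ex_labels = "politics, sports, science"

with st.form(key='my_form'):
    text_input = st.text_area('Enter text to summarize and classify:')
    # The radio added by this commit; its value is read only on submit.
    gen_keywords = st.radio("Generate keywords from text?", ('Yes', 'No'))
    labels = st.text_input('Enter possible labels (comma-separated):', ex_labels, max_chars=1000)
    submit_button = st.form_submit_button(label='Submit')

if submit_button:
    st.write(f"Keywords requested: {gen_keywords}; labels: {labels}")
```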
@@ -45,51 +50,76 @@ with st.form(key='my_form'):
     submit_button = st.form_submit_button(label='Submit')
 
 
-
+
+with st.spinner('Loading pretrained summarizer and classifier mnli model...'):
     start = time.time()
     summarizer = md.load_summary_model()
-
+    s_time = round(time.time() - start,4)
 
-with st.spinner('Loading pretrained classifier mnli model...'):
     start = time.time()
-    classifier = md.load_model()
-
+    classifier = md.load_model()
+    c_time = round(time.time() - start,4)
+
+    st.success(f'Time taken to load: summarizer mnli model {s_time}s & classifier mnli model {c_time}s')
+
+    # with st.spinner('Loading pretrained classifier mnli model...'):
+    #     start = time.time()
+    #     classifier = md.load_model()
+    #     st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
 
 
 if submit_button:
-    if len(
-        st.write(
-
-
-    my_expander = st.expander(label='Expand to see summary generation details')
-    with my_expander:
+    if len(text_input) == 0:
+        st.write("Enter some text to generate a summary")
+    else:
+        with st.spinner('Breaking up text into more reasonable chunks (tranformers cannot exceed a 1024 token max)...'):
             # For each body of text, create text chunks of a certain token size required for the transformer
             nested_sentences = md.create_nest_sentences(document = text_input, token_max_length = 1024)
-
-
-
-
-
-
-
-
-
-
-
-        st.markdown(
-
-
-        summary.
-
-
-        #
-
-
-
-
-
-
-
+            # For each chunk of sentences (within the token max)
+            text_chunks = []
+            for n in range(0, len(nested_sentences)):
+                tc = " ".join(map(str, nested_sentences[n]))
+                text_chunks.append(tc)
+
+        with st.spinner('Generating summaries for text chunks...'):
+
+            my_expander = st.expander(label='Expand to see summary generation details')
+            with my_expander:
+                summary = []
+                st.markdown("### Text Chunk & Summaries")
+                # st.markdown("_Breaks up the original text into sections with complete sentences totaling \
+                #     less than 1024 tokens, a requirement for the summarizer. Each block of text is than summarized separately \
+                #     and then combined at the very end to generate the final summary._")
+
+                # # For each chunk of sentences (within the token max), generate a summary
+                # for n in range(0, len(nested_sentences)):
+                #     text_chunk = " ".join(map(str, nested_sentences[n]))
+                #     st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
+                #     st.markdown(text_chunk)
+
+                for num_chunk, text_chunk in enumerate(text_chunks):
+                    st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
+                    st.markdown(text_chunk)
+
+                    chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
+                    summary.append(chunk_summary)
+                    st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
+                    st.markdown(chunk_summary)
+                    # Combine all the summaries into a list and compress into one document, again
+                    final_summary = " \n\n".join(list(summary))
+
+                # final_summary = md.summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
+                st.markdown("### Combined Summary")
+                st.markdown(final_summary)
+
+                # if gen_keywords == 'Yes':
+                #     st.markdown("### Top Keywords")
+                #     with st.spinner("Generating keywords from text...")
+                #         keywords =
+
+    if len(text_input) == 0 or len(labels) == 0:
+        st.write('Enter some text and at least one possible topic to see predictions.')
+    else:
         st.markdown("### Top Label Predictions on Summary & Full Text")
         with st.spinner('Matching labels...'):
             topics, scores = md.classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
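md above is the app's own models module; create_nest_sentences and summarizer_gen are defined there, not in this diff. A rough sketch of what a token-budget sentence chunker like create_nest_sentences plausibly does, assuming a Hugging Face tokenizer (the checkpoint name below is an assumption):

```python
from transformers import AutoTokenizer

def create_nest_sentences(document: str, token_max_length: int = 1024):
    """Group sentences into chunks that each fit under the model's token limit."""
    # Assumption: a BART-family summarizer, so its tokenizer sets the budget.
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    nested, chunk, length = [], [], 0
    # Naive sentence split; the real helper may use a proper sentence splitter.
    for sentence in document.replace("\n", " ").split(". "):
        n_tokens = len(tokenizer.tokenize(sentence))
        if length + n_tokens <= token_max_length:
            chunk.append(sentence)
            length += n_tokens
        else:
            if chunk:
                nested.append(chunk)
            chunk, length = [sentence], n_tokens
    if chunk:
        nested.append(chunk)
    return nested
```

Each returned element is a list of sentences, which the hunk above joins with " ".join(map(str, ...)) into one text_chunk per summarizer call.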
@@ -146,7 +176,7 @@ if submit_button:
         section_header_description = ['Summary Label Performance', 'Original Full Text Label Performance']
         data_headers = ['scores_from_summary', 'scores_from_full_text']
         for i in range(0,2):
-            st.markdown(f"
+            st.markdown(f"###### {section_header_description[i]}")
             report = classification_report(y_true = data2[['is_true_label']],
                                            y_pred = (data2[[data_headers[i]]] >= threshold_value) * 1.0,
                                            output_dict=True)
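The loop in this hunk scores each label source against the true labels after binarizing at threshold_value. A condensed sketch of that evaluation step, assuming data2 holds one row per label with an is_true_label column and the score columns named above (df_report in the surrounding context is presumably built this way):

```python
import pandas as pd
from sklearn.metrics import classification_report

def report_at_threshold(data2: pd.DataFrame, score_column: str,
                        threshold_value: float) -> pd.DataFrame:
    """Binarize scores at the threshold and return the sklearn report as a DataFrame."""
    report = classification_report(
        y_true=data2[['is_true_label']],
        y_pred=(data2[[score_column]] >= threshold_value) * 1.0,
        output_dict=True,
    )
    # output_dict=True returns a nested dict keyed by class and metric.
    return pd.DataFrame(report).transpose()

# e.g. report_at_threshold(data2, 'scores_from_summary', threshold_value=0.5)
```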
@@ -154,5 +184,5 @@ if submit_button:
         st.markdown(f"Threshold set for: {threshold_value}")
         st.dataframe(df_report)
 
-
-
+    st.success('All done!')
+    st.balloons()
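Finally, md.load_summary_model, md.load_model, and md.classifier_zero are also app-side wrappers not shown in this diff. Assuming they sit on top of the Hugging Face pipelines (a guess consistent with the "mnli" naming in the spinner text; both checkpoint names are assumptions), a minimal sketch of the load-and-classify path this commit times:

```python
from transformers import pipeline

def load_summary_model():
    # Assumption: a BART-style summarization checkpoint.
    return pipeline("summarization", model="facebook/bart-large-cnn")

def load_model():
    # "mnli" in the spinner text suggests an NLI-based zero-shot checkpoint.
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def classifier_zero(classifier, sequence: str, labels: list, multi_class: bool = True):
    # multi_label scores each candidate label independently instead of softmaxing.
    outputs = classifier(sequence, candidate_labels=labels, multi_label=multi_class)
    return outputs['labels'], outputs['scores']
```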
|