Spaces:
Runtime error
Runtime error
feat: multifile processing
Browse files- app.py +45 -37
- data_viewer.py +9 -6
- process_text.py +66 -0
app.py
CHANGED
|
@@ -5,11 +5,12 @@ Dashboard for showcasing extraction of text metrics with textdescriptives.
|
|
| 5 |
|
| 6 |
from io import StringIO
|
| 7 |
|
| 8 |
-
import
|
| 9 |
import streamlit as st
|
| 10 |
import textdescriptives as td
|
| 11 |
|
| 12 |
from data_viewer import DataViewer
|
|
|
|
| 13 |
from options import (
|
| 14 |
all_model_size_options_pretty_to_short,
|
| 15 |
available_model_size_options,
|
|
@@ -28,7 +29,7 @@ with col1:
|
|
| 28 |
with col2:
|
| 29 |
st.image(
|
| 30 |
"https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
|
| 31 |
-
width=125
|
| 32 |
)
|
| 33 |
|
| 34 |
st.write(
|
|
@@ -46,8 +47,8 @@ st.write(
|
|
| 46 |
|
| 47 |
st.caption(
|
| 48 |
"Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
|
| 49 |
-
"calculating a large variety of
|
| 50 |
-
"
|
| 51 |
)
|
| 52 |
|
| 53 |
|
|
@@ -57,22 +58,25 @@ st.caption(
|
|
| 57 |
|
| 58 |
|
| 59 |
input_choice = st.radio(
|
| 60 |
-
label="Input", options=["Enter text", "Upload file"], index=0, horizontal=True
|
| 61 |
)
|
| 62 |
|
| 63 |
with st.form(key="settings_form"):
|
| 64 |
split_by_line = st.checkbox(label="Split by newline", value=True)
|
| 65 |
|
| 66 |
-
|
| 67 |
|
| 68 |
-
if input_choice == "Upload file":
|
| 69 |
-
|
| 70 |
-
label="Choose a .txt file", type=["txt"], accept_multiple_files=
|
| 71 |
)
|
| 72 |
|
| 73 |
-
if
|
| 74 |
# To convert to a string based IO:
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
else:
|
| 78 |
default_text = """Hello, morning dew. The grass whispers low.
|
|
@@ -81,9 +85,11 @@ Good morning, world. The birds sing in delight.
|
|
| 81 |
Let's spread our wings. The butterflies take flight.
|
| 82 |
Nature's chorus sings, a symphony of light."""
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
|
| 88 |
# Row of selectors
|
| 89 |
col1, col2 = st.columns([1, 1])
|
|
@@ -132,30 +138,26 @@ Nature's chorus sings, a symphony of light."""
|
|
| 132 |
#############
|
| 133 |
|
| 134 |
|
| 135 |
-
if apply_settings_button and
|
| 136 |
if model_size_pretty not in available_model_size_options(lang=language_short):
|
| 137 |
st.write(
|
| 138 |
"**Sorry!** The chosen *model size* is not available in this language. Please try another."
|
| 139 |
)
|
| 140 |
else:
|
| 141 |
-
#
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
text=string_data,
|
| 156 |
-
lang=language_short,
|
| 157 |
-
spacy_model_size=model_size_short,
|
| 158 |
-
metrics=metrics,
|
| 159 |
)
|
| 160 |
|
| 161 |
###################
|
|
@@ -165,13 +167,15 @@ if apply_settings_button and string_data is not None and string_data:
|
|
| 165 |
# Create 2 columns with 1) the output header
|
| 166 |
# and 2) a download button
|
| 167 |
DataViewer()._header_and_download(
|
| 168 |
-
header="The calculated metrics",
|
|
|
|
|
|
|
| 169 |
)
|
| 170 |
|
| 171 |
st.write("**Note**: This data frame has been transposed for readability.")
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
st.dataframe(data=
|
| 175 |
|
| 176 |
|
| 177 |
############################
|
|
@@ -182,6 +186,10 @@ if apply_settings_button and string_data is not None and string_data:
|
|
| 182 |
with st.expander("See python code"):
|
| 183 |
st.code(
|
| 184 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
import textdescriptives as td
|
| 186 |
|
| 187 |
# Given a string of text and the settings
|
|
|
|
| 5 |
|
| 6 |
from io import StringIO
|
| 7 |
|
| 8 |
+
import pandas as pd
|
| 9 |
import streamlit as st
|
| 10 |
import textdescriptives as td
|
| 11 |
|
| 12 |
from data_viewer import DataViewer
|
| 13 |
+
from process_text import text_to_metrics
|
| 14 |
from options import (
|
| 15 |
all_model_size_options_pretty_to_short,
|
| 16 |
available_model_size_options,
|
|
|
|
| 29 |
with col2:
|
| 30 |
st.image(
|
| 31 |
"https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
|
| 32 |
+
width=125,
|
| 33 |
)
|
| 34 |
|
| 35 |
st.write(
|
|
|
|
| 47 |
|
| 48 |
st.caption(
|
| 49 |
"Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
|
| 50 |
+
"calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
|
| 51 |
+
"5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
|
| 52 |
)
|
| 53 |
|
| 54 |
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
input_choice = st.radio(
|
| 61 |
+
label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
|
| 62 |
)
|
| 63 |
|
| 64 |
with st.form(key="settings_form"):
|
| 65 |
split_by_line = st.checkbox(label="Split by newline", value=True)
|
| 66 |
|
| 67 |
+
file_name_to_text_string = {}
|
| 68 |
|
| 69 |
+
if input_choice == "Upload file(s)":
|
| 70 |
+
uploaded_files = st.file_uploader(
|
| 71 |
+
label="Choose a .txt file", type=["txt"], accept_multiple_files=True
|
| 72 |
)
|
| 73 |
|
| 74 |
+
if uploaded_files is not None and len(uploaded_files) > 0:
|
| 75 |
# To convert to a string based IO:
|
| 76 |
+
file_name_to_text_string = {
|
| 77 |
+
file.name: StringIO(file.getvalue().decode("utf-8")).read()
|
| 78 |
+
for file in uploaded_files
|
| 79 |
+
}
|
| 80 |
|
| 81 |
else:
|
| 82 |
default_text = """Hello, morning dew. The grass whispers low.
|
|
|
|
| 85 |
Let's spread our wings. The butterflies take flight.
|
| 86 |
Nature's chorus sings, a symphony of light."""
|
| 87 |
|
| 88 |
+
file_name_to_text_string = {
|
| 89 |
+
"input": st.text_area(
|
| 90 |
+
label="Enter text", value=default_text, height=145, max_chars=None
|
| 91 |
+
)
|
| 92 |
+
}
|
| 93 |
|
| 94 |
# Row of selectors
|
| 95 |
col1, col2 = st.columns([1, 1])
|
|
|
|
| 138 |
#############
|
| 139 |
|
| 140 |
|
| 141 |
+
if apply_settings_button and len(file_name_to_text_string) > 0:
|
| 142 |
if model_size_pretty not in available_model_size_options(lang=language_short):
|
| 143 |
st.write(
|
| 144 |
"**Sorry!** The chosen *model size* is not available in this language. Please try another."
|
| 145 |
)
|
| 146 |
else:
|
| 147 |
+
# Extract metrics for each text
|
| 148 |
+
output_df = pd.concat(
|
| 149 |
+
[
|
| 150 |
+
text_to_metrics(
|
| 151 |
+
string=string,
|
| 152 |
+
language_short=language_short,
|
| 153 |
+
model_size_short=model_size_short,
|
| 154 |
+
metrics=metrics,
|
| 155 |
+
split_by_line=split_by_line,
|
| 156 |
+
filename=filename if "Upload" in input_choice else None,
|
| 157 |
+
)
|
| 158 |
+
for filename, string in file_name_to_text_string.items()
|
| 159 |
+
],
|
| 160 |
+
ignore_index=True,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
)
|
| 162 |
|
| 163 |
###################
|
|
|
|
| 167 |
# Create 2 columns with 1) the output header
|
| 168 |
# and 2) a download button
|
| 169 |
DataViewer()._header_and_download(
|
| 170 |
+
header="The calculated metrics",
|
| 171 |
+
data=output_df,
|
| 172 |
+
file_name="text_metrics.csv",
|
| 173 |
)
|
| 174 |
|
| 175 |
st.write("**Note**: This data frame has been transposed for readability.")
|
| 176 |
+
output_df = output_df.transpose().reset_index()
|
| 177 |
+
output_df.columns = ["Metric"] + [str(c) for c in list(output_df.columns)[1:]]
|
| 178 |
+
st.dataframe(data=output_df, use_container_width=True)
|
| 179 |
|
| 180 |
|
| 181 |
############################
|
|
|
|
| 186 |
with st.expander("See python code"):
|
| 187 |
st.code(
|
| 188 |
"""
|
| 189 |
+
# Note: This is the code for a single text file
|
| 190 |
+
# The actual code is slightly more complex
|
| 191 |
+
# to allow processing multiple files at once
|
| 192 |
+
|
| 193 |
import textdescriptives as td
|
| 194 |
|
| 195 |
# Given a string of text and the settings
|
data_viewer.py
CHANGED
|
@@ -1,14 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
import streamlit as st
|
| 3 |
|
| 4 |
|
| 5 |
class DataViewer:
|
| 6 |
-
|
| 7 |
-
# @st.cache_data
|
| 8 |
def _convert_df_to_csv(self, data, **kwargs):
|
| 9 |
-
return data.to_csv(**kwargs).encode(
|
| 10 |
|
| 11 |
-
def _header_and_download(
|
|
|
|
|
|
|
| 12 |
col1, col2 = st.columns([9, 2])
|
| 13 |
with col1:
|
| 14 |
st.subheader(header)
|
|
@@ -16,8 +19,8 @@ class DataViewer:
|
|
| 16 |
st.write("")
|
| 17 |
st.download_button(
|
| 18 |
label=label,
|
| 19 |
-
data=self._convert_df_to_csv(data),
|
| 20 |
file_name=file_name,
|
| 21 |
key=key,
|
| 22 |
-
help=help
|
| 23 |
)
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Class for showing header and download button in the same row.
|
| 3 |
+
"""
|
| 4 |
|
| 5 |
import streamlit as st
|
| 6 |
|
| 7 |
|
| 8 |
class DataViewer:
|
|
|
|
|
|
|
| 9 |
def _convert_df_to_csv(self, data, **kwargs):
|
| 10 |
+
return data.to_csv(**kwargs).encode("utf-8")
|
| 11 |
|
| 12 |
+
def _header_and_download(
|
| 13 |
+
self, header, data, file_name, key=None, label="Download", help="Download data"
|
| 14 |
+
):
|
| 15 |
col1, col2 = st.columns([9, 2])
|
| 16 |
with col1:
|
| 17 |
st.subheader(header)
|
|
|
|
| 19 |
st.write("")
|
| 20 |
st.download_button(
|
| 21 |
label=label,
|
| 22 |
+
data=self._convert_df_to_csv(data, index=False),
|
| 23 |
file_name=file_name,
|
| 24 |
key=key,
|
| 25 |
+
help=help,
|
| 26 |
)
|
process_text.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
The text processing functionality.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import textdescriptives as td
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _split_text(string: str, split_by_line: bool) -> List[str]:
    """Return the non-blank segments of ``string`` that should be analysed."""
    string = string.strip()
    # `splitlines` treats "\n", "\r\n" and "\r" alike, so text from files
    # saved with Windows line endings does not keep a stray "\r" per line.
    segments = string.splitlines() if split_by_line else [string]
    # Drop empty and whitespace-only segments (e.g. consecutive newlines);
    # the kept segments themselves are left untouched.
    return [segment for segment in segments if segment.strip()]


@st.cache_data
def text_to_metrics(
    string: str,
    language_short: str,
    model_size_short: str,
    metrics: List[str],
    split_by_line: bool,
    filename: Optional[str],
) -> pd.DataFrame:
    """
    Extract text metrics from a string with `textdescriptives`.

    Parameters
    ----------
    string : str
        The raw text. Surrounding whitespace is stripped before processing.
    language_short : str
        Passed to `td.extract_metrics` as `lang`.
    model_size_short : str
        Passed to `td.extract_metrics` as `spacy_model_size`.
    metrics : list of str
        Passed to `td.extract_metrics` as `metrics`.
    split_by_line : bool
        Whether to split the text at line boundaries and process each
        non-blank line as a separate text. Otherwise the whole string is
        processed as a single text.
    filename : str or None
        When not `None`, a "File" column holding this value is added as the
        first column of the output.

    Returns
    -------
    `pandas.DataFrame`
        The data frame returned by `td.extract_metrics`
        (presumably one row per text segment -- TODO confirm),
        optionally with the "File" column in front.
    """
    # Clean and (optionally) split the text, removing blank segments
    strings = _split_text(string=string, split_by_line=split_by_line)

    # Will automatically download the relevant model and extract all metrics
    # TODO: Download beforehand to speed up inference
    df = td.extract_metrics(
        text=strings,
        lang=language_short,
        spacy_model_size=model_size_short,
        metrics=metrics,
    )

    # Add filename
    if filename is not None:
        df["File"] = filename
        move_column_inplace(df=df, col="File", pos=0)

    return df
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def move_column_inplace(df: pd.DataFrame, col: str, pos: int) -> None:
    """
    Move a column to a given column-index position.

    Taken from the `utipy` package.

    The data frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : `pandas.DataFrame`.
    col : str
        Name of column to move.
    pos : int
        Column index to move `col` to.

    Raises
    ------
    AssertionError
        If `pos` is not within `0 <= pos < len(df.columns)`.
    """
    # Validate with an explicit raise instead of an `assert` statement so the
    # check still runs under `python -O`. The exception type and message are
    # kept as before for backward compatibility.
    if not 0 <= pos < len(df.columns):
        raise AssertionError(
            f"`pos` must be between 0 (incl.) and the number of columns -1. Was {pos}."
        )
    # Remove the column and re-insert it at the requested position.
    column = df.pop(col)
    df.insert(pos, column.name, column)
|