github-actions[bot] committed on
Commit
a4a7f53
·
1 Parent(s): fe77b4f

Sync with https://github.com/mozilla-ai/document-to-podcast

Browse files
Files changed (2) hide show
  1. app.py +10 -54
  2. requirements.txt +1 -0
app.py CHANGED
@@ -6,9 +6,6 @@ from pathlib import Path
6
  import numpy as np
7
  import soundfile as sf
8
  import streamlit as st
9
- import requests
10
- from bs4 import BeautifulSoup
11
- from requests.exceptions import RequestException
12
 
13
  from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
14
  from document_to_podcast.inference.model_loaders import (
@@ -55,7 +52,10 @@ uploaded_file = st.file_uploader(
55
  "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
56
  )
57
 
58
- if uploaded_file is not None:
 
 
 
59
  st.divider()
60
  st.header("Loading and Cleaning Data")
61
  st.markdown(
@@ -63,11 +63,15 @@ if uploaded_file is not None:
63
  )
64
  st.divider()
65
 
66
- extension = Path(uploaded_file.name).suffix
 
 
 
 
 
67
 
68
  col1, col2 = st.columns(2)
69
 
70
- raw_text = DATA_LOADERS[extension](uploaded_file)
71
  with col1:
72
  st.subheader("Raw Text")
73
  st.text_area(
@@ -86,53 +90,6 @@ if uploaded_file is not None:
86
 
87
  st.divider()
88
 
89
- st.header("Or Enter a Website URL")
90
- url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
91
- process_url = st.button("Clean URL Content")
92
-
93
-
94
- def process_url_content(url: str) -> tuple[str, str]:
95
- """Fetch and clean content from a URL.
96
-
97
- Args:
98
- url: The URL to fetch content from
99
-
100
- Returns:
101
- tuple containing raw and cleaned text
102
- """
103
- response = requests.get(url)
104
- response.raise_for_status()
105
- soup = BeautifulSoup(response.text, "html.parser")
106
- raw_text = soup.get_text()
107
- return raw_text, DATA_CLEANERS[".html"](raw_text)
108
-
109
-
110
- if url and process_url:
111
- try:
112
- with st.spinner("Fetching and cleaning content..."):
113
- raw_text, clean_text = process_url_content(url)
114
- st.session_state["clean_text"] = clean_text
115
-
116
- # Display results
117
- col1, col2 = st.columns(2)
118
- with col1:
119
- st.subheader("Raw Text")
120
- st.text_area(
121
- "Number of characters before cleaning: " f"{len(raw_text)}",
122
- f"{raw_text[:500]}...",
123
- )
124
- with col2:
125
- st.subheader("Cleaned Text")
126
- st.text_area(
127
- "Number of characters after cleaning: " f"{len(clean_text)}",
128
- f"{clean_text[:500]}...",
129
- )
130
- except RequestException as e:
131
- st.error(f"Error fetching URL: {str(e)}")
132
- except Exception as e:
133
- st.error(f"Error processing content: {str(e)}")
134
-
135
- # Second part - Podcast generation
136
  if "clean_text" in st.session_state:
137
  clean_text = st.session_state["clean_text"]
138
 
@@ -143,7 +100,6 @@ if "clean_text" in st.session_state:
143
  )
144
  st.divider()
145
 
146
- # Load models
147
  text_model = load_text_to_text_model()
148
  speech_model = load_text_to_speech_model()
149
 
 
6
  import numpy as np
7
  import soundfile as sf
8
  import streamlit as st
 
 
 
9
 
10
  from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
11
  from document_to_podcast.inference.model_loaders import (
 
52
  "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
53
  )
54
 
55
+ st.header("Or Enter a Website URL")
56
+ url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
57
+
58
+ if uploaded_file is not None or url:
59
  st.divider()
60
  st.header("Loading and Cleaning Data")
61
  st.markdown(
 
63
  )
64
  st.divider()
65
 
66
+ if uploaded_file:
67
+ extension = Path(uploaded_file.name).suffix
68
+ raw_text = DATA_LOADERS[extension](uploaded_file)
69
+ else:
70
+ extension = ".html"
71
+ raw_text = DATA_LOADERS["url"](url)
72
 
73
  col1, col2 = st.columns(2)
74
 
 
75
  with col1:
76
  st.subheader("Raw Text")
77
  st.text_area(
 
90
 
91
  st.divider()
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  if "clean_text" in st.session_state:
94
  clean_text = st.session_state["clean_text"]
95
 
 
100
  )
101
  st.divider()
102
 
 
103
  text_model = load_text_to_text_model()
104
  speech_model = load_text_to_speech_model()
105
 
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ document-to-podcast