Spaces:
Sleeping
Sleeping
Commit
·
9ec1981
1
Parent(s):
0e8d848
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import streamlit as st
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
import requests
|
| 7 |
+
from urllib.parse import urlparse, quote
|
| 8 |
+
import re
|
| 9 |
+
from bs4 import BeautifulSoup
|
| 10 |
+
import time
|
| 11 |
+
from joblib import Parallel, delayed
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@st.cache_data
def convert_df(df):
    """Render *df* as CSV text; cached by Streamlit across app reruns."""
    csv_text = df.to_csv()
    return csv_text
|
| 17 |
+
|
| 18 |
+
def extract_website_domain(url):
    """Return the network-location (domain) part of *url*, e.g. 'www.zillow.com'."""
    return urlparse(url).netloc
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def google_address(address):
    """Google-search *address* and scrape the result snippets into a DataFrame.

    Parameters
    ----------
    address : str
        Free-form street address (e.g. "190 Pebble Creek Dr Etna, OH 43062").

    Returns
    -------
    pandas.DataFrame
        Columns Title / Link / Description / Address / Website /
        Square Footage / Beds / Baths / Year Built, filtered to rows whose
        extracted address contains the input's street number.
    """
    # Street number = first run of digits in the input; used to filter results.
    # BUG FIX: original indexed [0] unconditionally and crashed on addresses
    # with no digits; '' makes the str.contains filter below match everything.
    numbers = re.findall(r'\b\d+\b', address)
    address_number = numbers[0] if numbers else ''

    search_query = quote(address)
    url = f'https://www.google.com/search?q={search_query}'
    # BUG FIX: a request without a timeout can hang the app indefinitely.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")

    # Collect organic-result anchors: Google wraps them as '/url?q=<target>'.
    texts_links = []
    for link in soup.find_all("a"):
        t, l = link.get_text(), link.get("href")
        # BUG FIX: anchors without an href yield None; guard before slicing.
        if l is not None and l[:11] == '/url?q=http' and len(t) > 20:
            texts_links.append((t, l))

    text = soup.get_text()

    # Description for each result = page text between this result's title
    # and the next result's title (or the "Related searches" footer for the last).
    texts_links_des = []
    for i, t_l in enumerate(texts_links):
        start = text.find(t_l[0][:50])
        try:
            end = text.find(texts_links[i + 1][0][:50])
        except IndexError:  # BUG FIX: was a bare except; only i+1 overrun is expected
            end = text.find('Related searches')
        description = text[start:end]
        texts_links_des.append((t_l[0], t_l[1], description))

    df = pd.DataFrame(texts_links_des, columns=['Title', 'Link', 'Description'])
    df['Description'] = df['Description'].bfill()
    # Address heuristic: everything up to and including a 5-digit ZIP in the title.
    df['Address'] = df['Title'].str.extract(r'(.+? \d{5})')
    # Strip Google's '/url?q=' prefix and the '&sa=' tracking suffix.
    df['Link'] = [i[7:i.find('&sa=')] for i in df['Link']]
    df['Website'] = df['Link'].apply(extract_website_domain)

    df['Square Footage'] = df['Description'].str.extract(
        r"((\d+) Square Feet|(\d+) sq. ft.|(\d+) sqft|(\d+) Sq. Ft.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.)")[0]
    # BUG FIX: pandas >= 2.0 defaults Series.str.replace to regex=False, which
    # would leave the '\D' pattern un-applied; make the regex explicit.
    df['Square Footage'] = df['Square Footage'].replace({',': ''}, regex=True).str.replace(r'\D', '', regex=True)

    df['Beds'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(r"(\d+) bed")
    df['Baths'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(r"(\d+) bath")
    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})")

    # Keep only rows with a parsed address that contains the input street number.
    df_final = df[df['Address'].notnull()]
    df_final = df_final[df_final['Address'].str.contains(str(address_number))]
    return df_final
|
| 68 |
+
|
| 69 |
+
def process_multiple_address(addresses, trial=False):
    """Scrape Google results for every address, in parallel threads.

    Parameters
    ----------
    addresses : iterable of str
        Full address strings to look up via :func:`google_address`.
    trial : bool, optional
        Accepted for backward compatibility and currently unused.
        BUG FIX: a caller in this file passes ``trial=True``, which previously
        raised ``TypeError: unexpected keyword argument``.

    Returns
    -------
    list of pandas.DataFrame
        One per-address result frame, in input order.
    """
    results = Parallel(n_jobs=-1, prefer="threads")(
        delayed(google_address)(addr) for addr in addresses
    )
    return results
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ---------------------------------------------------------------------------
# Streamlit UI: look up one address, or a whole file of addresses, on Google
# and show/download the scraped property details.
# ---------------------------------------------------------------------------
st.set_page_config(layout="wide")
# col1, col2 = st.columns((2))
address_file = st.sidebar.radio('Choose', ('Single Address', 'File'))

address = st.sidebar.text_input("Address", "190 Pebble Creek Dr Etna, OH 43062")
uploaded_file = st.sidebar.file_uploader("Choose a file")
# uploaded_file='C:/Users/mritchey/Documents/addresses 100 generated.xlsx'
return_sq = st.sidebar.radio('Return Only Results with Square Footage', ('No', 'Yes'))


# BUG FIX: original condition was `and not None`, which is always True and
# crashed the File branch when nothing was uploaded yet.
if address_file == 'File' and uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except Exception:  # BUG FIX: was a bare except; fall back to Excel parsing
        uploaded_file.seek(0)  # BUG FIX: rewind the stream the CSV parse consumed
        df = pd.read_excel(uploaded_file)

    # First four columns are assumed to be street / city / state / ZIP
    # (TODO confirm against the expected upload template).
    address_cols = list(df.columns[:4])
    # Normalize ZIPs: keep the first 5 chars, drop any float artifact, re-pad.
    df[address_cols[-1]] = df[address_cols[-1]].astype(str).str[:5].astype(int).astype(str)
    df[address_cols[-1]] = df[address_cols[-1]].apply(lambda x: x.zfill(5))

    df['Address All'] = (df[address_cols[0]] + ', ' + df[address_cols[1]] + ', '
                         + df[address_cols[2]] + ' ' + df[address_cols[3]])

    # BUG FIX: original passed an undefined `trial=True` keyword (TypeError).
    frames = process_multiple_address(df['Address All'].values)
    # BUG FIX: the column projection below requires 'Address Input', which the
    # File branch never created; attach each input address to its result frame.
    tagged = []
    for addr, frame in zip(df['Address All'].values, frames):
        frame = frame.copy()
        frame.insert(0, 'Address Input', addr)
        tagged.append(frame)
    results = pd.concat(tagged).reset_index(drop=True)
    results.index = results.index + 1

else:
    results = google_address(address).reset_index(drop=True)
    results.index = results.index + 1
    results.insert(0, 'Address Input', address)


results = results[['Address Input', 'Address', 'Website', 'Square Footage',
                   'Beds', 'Baths', 'Year Built',
                   'Link', 'Description',
                   ]]

if return_sq == 'Yes':
    # `x == x` is False only for NaN: keeps rows where square footage parsed.
    results = results.query("`Square Footage`==`Square Footage`").reset_index(drop=True)
    results.index = results.index + 1

st.dataframe(
    results,
    column_config={
        "Link": st.column_config.LinkColumn("Link"),
    },
    hide_index=True,
)

csv2 = convert_df(results)
st.download_button(
    label="Download data as CSV",
    data=csv2,
    file_name=f'{address}.csv',
    mime='text/csv')


# Hide Streamlit's default hamburger menu and footer.
st.markdown(""" <style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style> """, unsafe_allow_html=True)
|