import streamlit as st
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import quote_plus  # Needed below to URL-encode user input in the search query
# Instantiate global variables
df = pd.DataFrame(columns=["Title", "Location", "Company", "Link", "Description"])
# Get user input
inputJobTitle = st.text_input("Enter Job Title:")
inputJobLocation = st.text_input("Enter Job Location:")
totalPages = st.number_input("Enter Total Pages:", min_value=1, value=1)
submit_button = st.button("Submit")
def scrapeJobDescription(url):
    # Open the job posting in its own headless Chrome instance and pull the description text.
    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # LinkedIn renders the description inside this "show more/less" container.
        jobDescription = soup.find(
            "div", class_="show-more-less-html__markup"
        ).text.strip()
        return jobDescription
    except AttributeError:
        # soup.find() returned None: the container is missing or the page failed to render.
        return ""
    finally:
        driver.quit()  # Release the browser even when scraping fails.
def scrapeLinkedin():
    # df is reassigned below, so it must be declared global; the input widgets are only read.
    global df
    counter = 0  # LinkedIn's guest search paginates with a start offset that grows by 25 per page.
    pageCounter = 1
    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    while pageCounter <= totalPages:
        try:
            driver.get(
                f"https://www.linkedin.com/jobs/search/?&keywords={quote_plus(inputJobTitle)}"
                f"&location={quote_plus(inputJobLocation)}&refresh=true&start={counter}"
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")
            ulElement = soup.find("ul", class_="jobs-search__results-list")
            liElements = ulElement.find_all("li")
            for item in liElements:
                jobTitle = item.find(
                    "h3", class_="base-search-card__title"
                ).text.strip()
                jobLocation = item.find(
                    "span", class_="job-search-card__location"
                ).text.strip()
                jobCompany = item.find(
                    "h4", class_="base-search-card__subtitle"
                ).text.strip()
                jobLink = item.find_all("a")[0]["href"]
                jobDescription = scrapeJobDescription(jobLink)
                if jobTitle and jobLocation and jobCompany and jobLink:
                    df = pd.concat(
                        [
                            df,
                            pd.DataFrame(
                                {
                                    "Title": [jobTitle],
                                    "Location": [jobLocation],
                                    "Company": [jobCompany],
                                    "Link": [jobLink],
                                    "Description": [jobDescription],
                                }
                            ),
                        ],
                        ignore_index=True,  # Keep a clean 0..n index instead of repeated 0s.
                    )
            counter += 25
            pageCounter += 1
        except Exception:
            # Stop paging when the results list is missing or a page fails to load.
            break
    driver.quit()
def convert_df(df):
    # Serialize the results to CSV bytes for st.download_button.
    return df.to_csv(index=False).encode("utf-8")
if submit_button:
    with st.spinner("Operation in progress. Please wait..."):
        scrapeLinkedin()
        time.sleep(1)
        st.write(df)
        csv = convert_df(df)
        st.download_button(
            "Press to Download",
            csv,
            "file.csv",
            "text/csv",
            key="download-csv",
        )
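# A minimal way to try this locally, assuming the file is saved as app.py and
# Chrome with a matching chromedriver is on PATH:
#     streamlit run app.py
# Streamlit then serves the UI at http://localhost:8501 by default.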