quin-general / web_search.py
anabmaulana's picture
update
228bbe9
raw
history blame
6.3 kB
import random
import re
from pprint import pprint
from urllib.request import Request, urlopen
from html.parser import HTMLParser
from bs4 import BeautifulSoup
from urllib.parse import quote_plus, parse_qs
import requests, lxml, json
import logging
import pdb
def USER_AGENT():
uastrings = [
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
]
ua = random.choice(uastrings)
logging.info(f'Using user agent: {ua}')
return ua
def brave_search(query: str, pages=1) -> list:
params = {
'q': quote_plus(query),
'source': 'web',
'tf': 'at',
}
headers = {
'User-Agent': USER_AGENT()
}
urls = []
for page in range(pages):
params['offset'] = page
html = requests.get('https://search.brave.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')
for result in soup.select('.snippet'):
link = result.select_one('.result-header').get('href')
"""
sitelinks_container = result.select('.deep-results-buttons .deep-link')
sitelinks = None
if sitelinks_container:
sitelinks = []
for sitelink in sitelinks_container:
sitelinks.append({
'title': sitelink.get_text().strip(),
'link': sitelink.get('href')
})
"""
urls.append(link)
return urls
def bing_search(query: str, pages=1) -> list:
"""
Gets web results from Bing
:param query: query to search
:param pages_number: number of search pages to scrape
:return: a list of links in ranked order
"""
urls = []
for page in range(pages):
first = page * 10 + 1
address = "https://www.bing.com/search?q=" + quote_plus(query) + '&first=' + str(first)
data = get_html(address)
soup = BeautifulSoup(data, 'lxml')
links = soup.findAll('li', {'class': 'b_algo'})
urls.extend([link.find('h2').find('a')['href'] for link in links])
return urls
def duckduckgo_search(query: str, pages=1):
"""
NOT WORKING; LIKELY BLOCKED
"""
urls = []
start_index = 0
for page in range(pages):
address = "https://duckduckgo.com/html/?kl=en-us&q={}&s={}".format(quote_plus(query), start_index)
data = get_html(address)
soup = BeautifulSoup(data, 'lxml')
links = soup.findAll('a', {'class': 'result__a'})
urls.extend([link['href'] for link in links])
start_index = len(urls)
try:
urls = [parse_qs(l.split('/')[-1][5:])[''][0] for l in urls]
except:
logging.warn(f'Parsing failed for {len(urls)} urls')
return urls
def news_search(query: str, pages=1):
urls = []
for page in range(pages):
api_url = f'https://newslookup.com/results?l=2&q={quote_plus(query)}&dp=&mt=-1&mkt=0&mtx=0&mktx=0&s=&groupby=no&cat=-1&from=&fmt=&tp=720&ps=50&ovs=&page={page}'
data = get_html(api_url)
soup = BeautifulSoup(data, 'lxml')
links = soup.findAll('a', {'class': 'title'})
urls.extend([link['href'] for link in links])
return urls
def get_html(url: str) -> str:
"""
Downloads the html source code of a webpage
:param url:
:return: html source code
"""
try:
headers = {
'User-Agent': USER_AGENT()
}
req = Request(url, headers=headers)
page = urlopen(req, timeout=3)
return str(page.read())
except:
return ''
class WebParser(HTMLParser):
"""
A class for converting the tagged html to formats that can be used by a ML model
"""
def __init__(self):
super().__init__()
self.block_tags = {
'div', 'p'
}
self.inline_tags = {
'', 'a', 'b', 'tr', 'main', 'span', 'time', 'td',
'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'strong', 'br'
}
self.allowed_tags = self.block_tags.union(self.inline_tags)
self.opened_tags = []
self.block_content = ''
self.blocks = []
def get_last_opened_tag(self):
"""
Gets the last visited tag
:return:
"""
if len(self.opened_tags) > 0:
return self.opened_tags[len(self.opened_tags) - 1]
return ''
def error(self, message):
pass
def handle_starttag(self, tag, attrs):
"""
Handles the start tag of an HTML node in the tree
:param tag: the HTML tag
:param attrs: the tag attributes
:return:
"""
self.opened_tags.append(tag)
if tag in self.block_tags:
self.block_content = self.block_content.strip()
if len(self.block_content) > 0:
if not self.block_content.endswith('.'):
self.block_content += '.'
self.block_content = self.block_content.replace('\\n', ' ').replace('\\r', ' ')
self.block_content = re.sub("\s\s+", " ", self.block_content)
self.blocks.append(self.block_content)
self.block_content = ''
def handle_endtag(self, tag):
"""
Handles the end tag of an HTML node in the tree
:param tag: the HTML tag
:return:
"""
if len(self.opened_tags) > 0:
self.opened_tags.pop()
def handle_data(self, data):
"""
Handles a text HTML node in the tree
:param data: the text node
:return:
"""
last_opened_tag = self.get_last_opened_tag()
if last_opened_tag in self.allowed_tags:
data = data.replace(' ', ' ').strip()
if data != '':
self.block_content += data + ' '
def get_text(self):
return "\n\n".join(self.blocks)