import random
import re
import logging

from html.parser import HTMLParser
from urllib.request import Request, urlopen
from urllib.parse import quote_plus, parse_qs, urlparse

import requests
# the 'lxml' parser backend must be installed for the BeautifulSoup calls below
from bs4 import BeautifulSoup


def USER_AGENT():
    """Returns a randomly chosen browser user-agent string to vary request fingerprints."""
    uastrings = [
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
    ]
    ua = random.choice(uastrings)
    logging.info(f'Using user agent: {ua}')
    return ua


def brave_search(query: str, pages=1) -> list:
    """
    Gets web results from Brave
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    params = {
        'q': query,  # requests URL-encodes params itself; pre-quoting would double-encode
        'source': 'web',
        'tf': 'at',
    }
    headers = {
        'User-Agent': USER_AGENT()
    }
    urls = []
    for page in range(pages):
        params['offset'] = page
        html = requests.get('https://search.brave.com/search', headers=headers, params=params)
        soup = BeautifulSoup(html.text, 'lxml')
        for result in soup.select('.snippet'):
            header = result.select_one('.result-header')
            if header is None:  # skip snippets without a result link
                continue
            link = header.get('href')
            # Sitelink extraction, kept for reference but currently disabled:
            # sitelinks_container = result.select('.deep-results-buttons .deep-link')
            # sitelinks = None
            # if sitelinks_container:
            #     sitelinks = []
            #     for sitelink in sitelinks_container:
            #         sitelinks.append({
            #             'title': sitelink.get_text().strip(),
            #             'link': sitelink.get('href')
            #         })
            urls.append(link)
    return urls
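

# Example usage (a hypothetical sketch; assumes Brave still serves the
# '.snippet' and '.result-header' classes selected above and is not
# blocking the request):
#
#   top_links = brave_search('open source web crawlers', pages=2)
#   for rank, url in enumerate(top_links, start=1):
#       print(rank, url)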


def bing_search(query: str, pages=1) -> list:
    """
    Gets web results from Bing
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    urls = []
    for page in range(pages):
        first = page * 10 + 1  # Bing paginates via the 1-based 'first' result index
        address = "https://www.bing.com/search?q=" + quote_plus(query) + '&first=' + str(first)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('li', {'class': 'b_algo'})
        urls.extend([link.find('h2').find('a')['href'] for link in links])
    return urls


def duckduckgo_search(query: str, pages=1):
    """
    NOT WORKING; LIKELY BLOCKED
    """
    urls = []
    start_index = 0
    for page in range(pages):
        address = "https://duckduckgo.com/html/?kl=en-us&q={}&s={}".format(quote_plus(query), start_index)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'result__a'})
        urls.extend([link['href'] for link in links])
        start_index = len(urls)
    try:
        # DuckDuckGo wraps targets in redirect links of the form
        # //duckduckgo.com/l/?uddg=<encoded url>; unwrap the 'uddg' parameter.
        urls = [parse_qs(urlparse(l).query)['uddg'][0] for l in urls]
    except (KeyError, IndexError):
        logging.warning(f'Parsing failed for {len(urls)} urls')
    return urls


def news_search(query: str, pages=1):
    """
    Gets news results from newslookup.com
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of article links in ranked order
    """
    urls = []
    for page in range(pages):
        api_url = f'https://newslookup.com/results?l=2&q={quote_plus(query)}&dp=&mt=-1&mkt=0&mtx=0&mktx=0&s=&groupby=no&cat=-1&from=&fmt=&tp=720&ps=50&ovs=&page={page}'
        data = get_html(api_url)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'title'})
        urls.extend([link['href'] for link in links])
    return urls
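

# Example (a sketch; depends on newslookup.com's current markup and the
# 'title' anchor class queried above):
#
#   headlines = news_search('artificial intelligence')
#   print(headlines[:5])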


def get_html(url: str) -> str:
    """
    Downloads the html source code of a webpage
    :param url: the page URL
    :return: html source code, or '' on failure
    """
    try:
        headers = {
            'User-Agent': USER_AGENT()
        }
        req = Request(url, headers=headers)
        page = urlopen(req, timeout=3)
        # Decode the response bytes; str(page.read()) would return the bytes
        # repr ("b'...'") rather than the actual markup.
        return page.read().decode('utf-8', errors='ignore')
    except Exception as e:
        logging.warning(f'Failed to fetch {url}: {e}')
        return ''
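

# Example (a sketch; network-dependent, and get_html deliberately swallows
# errors and returns '' on any failure):
#
#   markup = get_html('https://example.com')
#   if markup:
#       print(markup[:200])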


class WebParser(HTMLParser):
    """
    A class for converting tagged HTML into text blocks usable by an ML model
    """

    def __init__(self):
        super().__init__()
        self.block_tags = {
            'div', 'p'
        }
        # The empty string lets handle_data() keep text that arrives when no
        # tag is open (get_last_opened_tag() returns '' in that case).
        self.inline_tags = {
            '', 'a', 'b', 'tr', 'main', 'span', 'time', 'td',
            'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'strong', 'br'
        }
        self.allowed_tags = self.block_tags.union(self.inline_tags)
        self.opened_tags = []
        self.block_content = ''
        self.blocks = []

    def get_last_opened_tag(self):
        """
        Gets the most recently opened tag
        :return: the tag name, or '' if no tag is open
        """
        if len(self.opened_tags) > 0:
            return self.opened_tags[-1]
        return ''

    def error(self, message):
        pass

    def handle_starttag(self, tag, attrs):
        """
        Handles the start tag of an HTML node in the tree
        :param tag: the HTML tag
        :param attrs: the tag attributes
        :return:
        """
        self.opened_tags.append(tag)
        if tag in self.block_tags:
            self.block_content = self.block_content.strip()
            if len(self.block_content) > 0:
                if not self.block_content.endswith('.'):
                    self.block_content += '.'
                # Collapse newlines and runs of whitespace into single spaces
                self.block_content = self.block_content.replace('\n', ' ').replace('\r', ' ')
                self.block_content = re.sub(r"\s\s+", " ", self.block_content)
                self.blocks.append(self.block_content)
                self.block_content = ''

    def handle_endtag(self, tag):
        """
        Handles the end tag of an HTML node in the tree
        :param tag: the HTML tag
        :return:
        """
        if len(self.opened_tags) > 0:
            self.opened_tags.pop()

    def handle_data(self, data):
        """
        Handles a text HTML node in the tree
        :param data: the text node
        :return:
        """
        last_opened_tag = self.get_last_opened_tag()
        if last_opened_tag in self.allowed_tags:
            # Normalize non-breaking spaces before trimming
            data = data.replace('\xa0', ' ').strip()
            if data != '':
                self.block_content += data + ' '

    def get_text(self):
        """Joins the collected text blocks into a single document string."""
        return "\n\n".join(self.blocks)