import random
import re
import logging

from html.parser import HTMLParser
from urllib.request import Request, urlopen
from urllib.parse import quote_plus, parse_qs, urlparse

import requests
# the 'lxml' parser backend must be installed for the BeautifulSoup calls below
from bs4 import BeautifulSoup


def USER_AGENT():
    """Returns a randomly chosen browser user-agent string to vary request fingerprints."""
    uastrings = [
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
    ]
    ua = random.choice(uastrings)
    logging.info(f'Using user agent: {ua}')
    return ua


def brave_search(query: str, pages=1) -> list:
    """
    Gets web results from Brave
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    params = {
        'q': query,  # requests URL-encodes params itself; pre-quoting would double-encode
        'source': 'web',
        'tf': 'at',
    }
    headers = {
        'User-Agent': USER_AGENT()
    }
    urls = []
    for page in range(pages):
        params['offset'] = page
        html = requests.get('https://search.brave.com/search', headers=headers, params=params)
        soup = BeautifulSoup(html.text, 'lxml')
        for result in soup.select('.snippet'):
            header = result.select_one('.result-header')
            if header is None:  # skip snippets without a result link
                continue
            link = header.get('href')
            # Sitelink extraction, kept for reference but currently disabled:
            # sitelinks_container = result.select('.deep-results-buttons .deep-link')
            # sitelinks = None
            # if sitelinks_container:
            #     sitelinks = []
            #     for sitelink in sitelinks_container:
            #         sitelinks.append({
            #             'title': sitelink.get_text().strip(),
            #             'link': sitelink.get('href')
            #         })
            urls.append(link)
    return urls
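

# Example usage (a hypothetical sketch; assumes Brave still serves the
# '.snippet' and '.result-header' classes selected above and is not
# blocking the request):
#
#   top_links = brave_search('open source web crawlers', pages=2)
#   for rank, url in enumerate(top_links, start=1):
#       print(rank, url)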


def bing_search(query: str, pages=1) -> list:
    """
    Gets web results from Bing
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    urls = []
    for page in range(pages):
        first = page * 10 + 1  # Bing paginates via the 1-based 'first' result index
        address = "https://www.bing.com/search?q=" + quote_plus(query) + '&first=' + str(first)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('li', {'class': 'b_algo'})
        urls.extend([link.find('h2').find('a')['href'] for link in links])
    return urls


def duckduckgo_search(query: str, pages=1):
    """
    NOT WORKING; LIKELY BLOCKED
    """
    urls = []
    start_index = 0
    for page in range(pages):
        address = "https://duckduckgo.com/html/?kl=en-us&q={}&s={}".format(quote_plus(query), start_index)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'result__a'})
        urls.extend([link['href'] for link in links])
        start_index = len(urls)
    try:
        # DuckDuckGo wraps targets in redirect links of the form
        # //duckduckgo.com/l/?uddg=<encoded url>; unwrap the 'uddg' parameter.
        urls = [parse_qs(urlparse(l).query)['uddg'][0] for l in urls]
    except (KeyError, IndexError):
        logging.warning(f'Parsing failed for {len(urls)} urls')
    return urls


def news_search(query: str, pages=1):
    """
    Gets news results from newslookup.com
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of article links in ranked order
    """
    urls = []
    for page in range(pages):
        api_url = f'https://newslookup.com/results?l=2&q={quote_plus(query)}&dp=&mt=-1&mkt=0&mtx=0&mktx=0&s=&groupby=no&cat=-1&from=&fmt=&tp=720&ps=50&ovs=&page={page}'
        data = get_html(api_url)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'title'})
        urls.extend([link['href'] for link in links])
    return urls
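

# Example (a sketch; depends on newslookup.com's current markup and the
# 'title' anchor class queried above):
#
#   headlines = news_search('artificial intelligence')
#   print(headlines[:5])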


def get_html(url: str) -> str:
    """
    Downloads the html source code of a webpage
    :param url: the page URL
    :return: html source code, or '' on failure
    """
    try:
        headers = {
            'User-Agent': USER_AGENT()
        }
        req = Request(url, headers=headers)
        page = urlopen(req, timeout=3)
        # Decode the response bytes; str(page.read()) would return the bytes
        # repr ("b'...'") rather than the actual markup.
        return page.read().decode('utf-8', errors='ignore')
    except Exception as e:
        logging.warning(f'Failed to fetch {url}: {e}')
        return ''
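

# Example (a sketch; network-dependent, and get_html deliberately swallows
# errors and returns '' on any failure):
#
#   markup = get_html('https://example.com')
#   if markup:
#       print(markup[:200])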


class WebParser(HTMLParser):
    """
    A class for converting tagged HTML into text blocks usable by an ML model
    """

    def __init__(self):
        super().__init__()
        self.block_tags = {
            'div', 'p'
        }
        # The empty string lets handle_data() keep text that arrives when no
        # tag is open (get_last_opened_tag() returns '' in that case).
        self.inline_tags = {
            '', 'a', 'b', 'tr', 'main', 'span', 'time', 'td',
            'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'strong', 'br'
        }
        self.allowed_tags = self.block_tags.union(self.inline_tags)
        self.opened_tags = []
        self.block_content = ''
        self.blocks = []

    def get_last_opened_tag(self):
        """
        Gets the most recently opened tag
        :return: the tag name, or '' if no tag is open
        """
        if len(self.opened_tags) > 0:
            return self.opened_tags[-1]
        return ''

    def error(self, message):
        pass

    def handle_starttag(self, tag, attrs):
        """
        Handles the start tag of an HTML node in the tree
        :param tag: the HTML tag
        :param attrs: the tag attributes
        :return:
        """
        self.opened_tags.append(tag)
        if tag in self.block_tags:
            self.block_content = self.block_content.strip()
            if len(self.block_content) > 0:
                if not self.block_content.endswith('.'):
                    self.block_content += '.'
                # Collapse newlines and runs of whitespace into single spaces
                self.block_content = self.block_content.replace('\n', ' ').replace('\r', ' ')
                self.block_content = re.sub(r"\s\s+", " ", self.block_content)
                self.blocks.append(self.block_content)
                self.block_content = ''

    def handle_endtag(self, tag):
        """
        Handles the end tag of an HTML node in the tree
        :param tag: the HTML tag
        :return:
        """
        if len(self.opened_tags) > 0:
            self.opened_tags.pop()

    def handle_data(self, data):
        """
        Handles a text HTML node in the tree
        :param data: the text node
        :return:
        """
        last_opened_tag = self.get_last_opened_tag()
        if last_opened_tag in self.allowed_tags:
            # Normalize non-breaking spaces before trimming
            data = data.replace('\xa0', ' ').strip()
            if data != '':
                self.block_content += data + ' '

    def get_text(self):
        """Joins the collected text blocks into a single document string."""
        return "\n\n".join(self.blocks)