import random
import re
import logging
from pprint import pprint
from urllib.request import Request, urlopen
from urllib.parse import quote_plus, parse_qs
from html.parser import HTMLParser

import requests
import lxml  # imported only so a missing bs4 'lxml' parser backend fails fast
from bs4 import BeautifulSoup
def USER_AGENT():
    """Returns a randomly chosen user-agent string."""
    uastrings = [
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
    ]
    ua = random.choice(uastrings)
    logging.info(f'Using user agent: {ua}')
    return ua
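
# Usage sketch: pair the helper above with a headers dict, e.g.
#   requests.get(url, headers={'User-Agent': USER_AGENT()})
# Rotating the agent string makes scripted traffic a bit less likely to be blocked.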
def brave_search(query: str, pages=1) -> list:
    """
    Gets web results from Brave Search
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    params = {
        # requests URL-encodes params itself; passing quote_plus(query) here
        # would double-encode the query
        'q': query,
        'source': 'web',
        'tf': 'at',
    }
    headers = {
        'User-Agent': USER_AGENT()
    }
    urls = []
    for page in range(pages):
        params['offset'] = page
        html = requests.get('https://search.brave.com/search', headers=headers, params=params, timeout=10)
        soup = BeautifulSoup(html.text, 'lxml')
        for result in soup.select('.snippet'):
            header = result.select_one('.result-header')
            if header is None:  # skip snippets without a result link
                continue
            link = header.get('href')
            # Sitelink extraction, kept for reference but currently disabled:
            # sitelinks = [{'title': s.get_text().strip(), 'link': s.get('href')}
            #              for s in result.select('.deep-results-buttons .deep-link')]
            urls.append(link)
    return urls
def bing_search(query: str, pages=1) -> list:
    """
    Gets web results from Bing
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    urls = []
    for page in range(pages):
        first = page * 10 + 1  # Bing paginates via the 1-based index of the first result
        address = "https://www.bing.com/search?q=" + quote_plus(query) + '&first=' + str(first)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('li', {'class': 'b_algo'})
        urls.extend([link.find('h2').find('a')['href'] for link in links])
    return urls
def duckduckgo_search(query: str, pages=1):
    """
    NOT WORKING; LIKELY BLOCKED
    """
    urls = []
    start_index = 0
    for page in range(pages):
        address = "https://duckduckgo.com/html/?kl=en-us&q={}&s={}".format(quote_plus(query), start_index)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'result__a'})
        urls.extend([link['href'] for link in links])
        start_index = len(urls)  # DDG paginates by result offset, not page number
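    # Worked example of the decoding in the try-block below; the link shape
    # ('//duckduckgo.com/l/?uddg=<encoded url>') is an assumption about
    # DuckDuckGo's HTML endpoint, which may have changed since:
    #   l.split('/')[-1][5:]   ->  '=https%3A%2F%2Fexample.com%2F'
    #   parse_qs(...)[''][0]   ->  'https://example.com/'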
    try:
        # Each href is a DuckDuckGo redirect link; recover the real
        # destination URL from its query string
        urls = [parse_qs(l.split('/')[-1][5:])[''][0] for l in urls]
    except (KeyError, IndexError):
        logging.warning(f'Parsing failed for {len(urls)} urls')
    return urls
def news_search(query: str, pages=1):
    """
    Gets news results from newslookup.com
    :param query: query to search
    :param pages: number of result pages to scrape
    :return: a list of article links
    """
    urls = []
    for page in range(pages):
        api_url = f'https://newslookup.com/results?l=2&q={quote_plus(query)}&dp=&mt=-1&mkt=0&mtx=0&mktx=0&s=&groupby=no&cat=-1&from=&fmt=&tp=720&ps=50&ovs=&page={page}'
        data = get_html(api_url)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'title'})
        urls.extend([link['href'] for link in links])
    return urls
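
# Usage sketch: the search helpers above are interchangeable, each returning a
# plain list of result URLs (all of them scrape third-party markup that can
# change without notice):
#   urls = bing_search('large language models', pages=2)
#   urls = news_search('large language models')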
def get_html(url: str) -> str:
    """
    Downloads the html source code of a webpage
    :param url: address of the page to fetch
    :return: html source code, or '' on any failure
    """
    try:
        headers = {
            'User-Agent': USER_AGENT()
        }
        req = Request(url, headers=headers)
        page = urlopen(req, timeout=3)
        # Decode the body instead of str()-ing the bytes object, which would
        # yield a "b'...'" literal full of escaped newlines
        return page.read().decode('utf-8', errors='ignore')
    except Exception:
        logging.warning(f'Fetching {url} failed')
        return ''
class WebParser(HTMLParser):
    """
    A class for converting tagged html to formats that can be used by an ML model
    """
    # Void elements never get a closing tag, so pushing them onto the
    # open-tag stack would desynchronize it
    VOID_TAGS = {'br', 'hr', 'img', 'input', 'link', 'meta'}

    def __init__(self):
        super().__init__()
        # Tags that terminate the current text block
        self.block_tags = {
            'div', 'p'
        }
        # Tags whose text is accumulated into the current block
        self.inline_tags = {
            '', 'a', 'b', 'tr', 'main', 'span', 'time', 'td',
            'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'strong', 'br'
        }
        self.allowed_tags = self.block_tags.union(self.inline_tags)
        self.opened_tags = []
        self.block_content = ''
        self.blocks = []
    def get_last_opened_tag(self):
        """
        Gets the most recently opened, still-unclosed tag
        :return: the tag name, or '' if no tag is open
        """
        if self.opened_tags:
            return self.opened_tags[-1]
        return ''
    def error(self, message):
        # Required override for older HTMLParser versions; parse errors are ignored
        pass
    def handle_starttag(self, tag, attrs):
        """
        Handles the start tag of an HTML node in the tree
        :param tag: the HTML tag
        :param attrs: the tag attributes
        :return:
        """
        if tag not in self.VOID_TAGS:
            self.opened_tags.append(tag)
        if tag in self.block_tags:
            # A new block-level tag ends the current block: normalize its
            # whitespace, make sure it ends in a period, and store it
            self.block_content = self.block_content.strip()
            if len(self.block_content) > 0:
                if not self.block_content.endswith('.'):
                    self.block_content += '.'
                self.block_content = self.block_content.replace('\n', ' ').replace('\r', ' ')
                self.block_content = re.sub(r"\s\s+", " ", self.block_content)
                self.blocks.append(self.block_content)
                self.block_content = ''
    def handle_endtag(self, tag):
        """
        Handles the end tag of an HTML node in the tree
        :param tag: the HTML tag
        :return:
        """
        if self.opened_tags:
            self.opened_tags.pop()
    def handle_data(self, data):
        """
        Handles a text HTML node in the tree
        :param data: the text node
        :return:
        """
        last_opened_tag = self.get_last_opened_tag()
        if last_opened_tag in self.allowed_tags:
            # Replace non-breaking spaces with regular spaces before trimming
            data = data.replace('\xa0', ' ').strip()
            if data != '':
                self.block_content += data + ' '
    def get_text(self):
        # Flush any text accumulated after the last block tag so trailing
        # content is not silently dropped
        self.block_content = self.block_content.strip()
        if self.block_content:
            self.blocks.append(self.block_content)
            self.block_content = ''
        return "\n\n".join(self.blocks)