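"""
Scraping helpers for several web search engines (Brave, Bing, DuckDuckGo,
newslookup.com) plus an HTMLParser subclass that converts fetched pages into
plain-text blocks suitable for downstream ML use.
"""
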
import logging
import random
import re
from html.parser import HTMLParser
from urllib.parse import parse_qs, quote_plus, urlparse
from urllib.request import Request, urlopen

import requests
from bs4 import BeautifulSoup

def USER_AGENT() -> str:
    """Returns a randomly chosen desktop or mobile User-Agent string to vary request fingerprints."""
    uastrings = [
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
    ]
    ua = random.choice(uastrings)
    logging.info(f'Using user agent: {ua}')
    return ua


def brave_search(query: str, pages=1) -> list:
    """
    Gets web results from Brave
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    # requests URL-encodes query parameters itself, so the raw query is passed
    # here; pre-encoding it with quote_plus would double-encode it
    params = {
        'q': query,
        'source': 'web',
        'tf': 'at',
    }
    headers = {
        'User-Agent': USER_AGENT()
    }

    urls = []
    for page in range(pages):
        params['offset'] = page
        response = requests.get('https://search.brave.com/search', headers=headers, params=params, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')

        for result in soup.select('.snippet'):
            header = result.select_one('.result-header')
            if header is None:
                continue
            link = header.get('href')
            """
            sitelinks_container = result.select('.deep-results-buttons .deep-link')
            sitelinks = None

            if sitelinks_container:
                sitelinks = []
                for sitelink in sitelinks_container:
                    sitelinks.append({
                        'title': sitelink.get_text().strip(),
                        'link': sitelink.get('href')
                    })
            """
            urls.append(link)
    return urls


def bing_search(query: str, pages=1) -> list:
    """
    Gets web results from Bing
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    urls = []
    for page in range(pages):
        first = page * 10 + 1  # Bing paginates by the 1-based index of the first result
        address = "https://www.bing.com/search?q=" + quote_plus(query) + '&first=' + str(first)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('li', {'class': 'b_algo'})
        urls.extend([link.find('h2').find('a')['href'] for link in links])

    return urls


def duckduckgo_search(query: str, pages=1) -> list:
    """
    NOT WORKING; LIKELY BLOCKED
    """
    urls = []
    start_index = 0
    for page in range(pages):
        address = "https://duckduckgo.com/html/?kl=en-us&q={}&s={}".format(quote_plus(query), start_index)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'result__a'})
        urls.extend([link['href'] for link in links])
        start_index = len(urls)
    try:
        # DuckDuckGo wraps each result in a redirect link such as
        # //duckduckgo.com/l/?uddg=<encoded-url>; unwrap it via the uddg parameter
        urls = [parse_qs(urlparse(link).query)['uddg'][0] for link in urls]
    except (KeyError, IndexError):
        logging.warning(f'Parsing failed for {len(urls)} urls')
    return urls


def news_search(query: str, pages=1) -> list:
    """
    Gets news results from newslookup.com
    :param query: query to search
    :param pages: number of result pages to scrape
    :return: a list of article links in ranked order
    """
    urls = []
    for page in range(pages):
        api_url = f'https://newslookup.com/results?l=2&q={quote_plus(query)}&dp=&mt=-1&mkt=0&mtx=0&mktx=0&s=&groupby=no&cat=-1&from=&fmt=&tp=720&ps=50&ovs=&page={page}'
        data = get_html(api_url)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'title'})
        urls.extend([link['href'] for link in links])
    return urls


def get_html(url: str) -> str:
    """
    Downloads the html source code of a webpage
    :param url: the page to download
    :return: html source code, or an empty string on failure
    """
    try:
        headers = {
            'User-Agent': USER_AGENT()
        }
        req = Request(url, headers=headers)
        page = urlopen(req, timeout=3)
        # Decode the body rather than calling str() on the raw bytes, which
        # would produce a "b'...'" literal with escaped newlines
        return page.read().decode('utf-8', errors='replace')
    except Exception as e:
        logging.warning(f'Failed to fetch {url}: {e}')
        return ''


class WebParser(HTMLParser):
    """
    A class for converting the tagged html to formats that can be used by a ML model
    """

    def __init__(self):
        super().__init__()
        self.block_tags = {
            'div', 'p'
        }
        self.inline_tags = {
            '', 'a', 'b', 'tr', 'main', 'span', 'time', 'td',
            'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'strong', 'br'
        }
        # Void elements never get a closing tag, so they are kept off the
        # opened-tags stack to avoid desynchronizing it
        self.void_tags = {
            'br', 'hr', 'img', 'input', 'meta', 'link', 'area', 'base', 'col',
            'embed', 'source', 'track', 'wbr'
        }
        self.allowed_tags = self.block_tags.union(self.inline_tags)
        self.opened_tags = []
        self.block_content = ''
        self.blocks = []

    def get_last_opened_tag(self):
        """
        Gets the most recently opened tag
        :return: the tag name, or '' if no tag is open
        """
        if self.opened_tags:
            return self.opened_tags[-1]
        return ''

    def error(self, message):
        # Required override on older Python versions; parse errors are ignored
        pass

    def handle_starttag(self, tag, attrs):
        """
        Handles the start tag of an HTML node in the tree
        :param tag: the HTML tag
        :param attrs: the tag attributes
        :return:
        """
        if tag not in self.void_tags:
            self.opened_tags.append(tag)
        if tag in self.block_tags:
            self.block_content = self.block_content.strip()
            if len(self.block_content) > 0:
                if not self.block_content.endswith('.'):
                    self.block_content += '.'
                # get_html returns decoded text, so real newline and carriage-return
                # characters are collapsed here, not the literal backslash sequences
                # that str(bytes) used to produce
                self.block_content = self.block_content.replace('\n', ' ').replace('\r', ' ')
                self.block_content = re.sub(r"\s\s+", " ", self.block_content)
                self.blocks.append(self.block_content)
            self.block_content = ''

    def handle_endtag(self, tag):
        """
        Handles the end tag of an HTML node in the tree
        :param tag: the HTML tag
        :return:
        """
        if tag in self.void_tags:
            return
        if len(self.opened_tags) > 0:
            self.opened_tags.pop()

    def handle_data(self, data):
        """
        Handles a text HTML node in the tree
        :param data: the text node
        :return:
        """
        last_opened_tag = self.get_last_opened_tag()
        if last_opened_tag in self.allowed_tags:
            data = data.replace('  ', ' ').strip()
            if data != '':
                self.block_content += data + ' '

    def get_text(self):
        """
        Joins the collected text blocks into a single string
        :return: the extracted text, blocks separated by blank lines
        """
        return "\n\n".join(self.blocks)
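

# Usage sketch (illustrative; the example query is made up): fetch ranked links
# for a query, download the top result, and print the readable text extracted
# by WebParser.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    results = bing_search('python html parsing', pages=1)
    print(f'Found {len(results)} results')

    if results:
        parser = WebParser()
        parser.feed(get_html(results[0]))
        print(parser.get_text()[:500])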