import random
import re
import logging
from pprint import pprint
from urllib.request import Request, urlopen
from urllib.parse import quote_plus, parse_qs
from html.parser import HTMLParser

import requests
import lxml  # imported only so a missing bs4 'lxml' parser backend fails fast
from bs4 import BeautifulSoup
def USER_AGENT():
    """Returns a randomly chosen user-agent string."""
    uastrings = [
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
    ]
    ua = random.choice(uastrings)
    logging.info(f'Using user agent: {ua}')
    return ua
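
# Usage sketch: pair the helper above with a headers dict, e.g.
#   requests.get(url, headers={'User-Agent': USER_AGENT()})
# Rotating the agent string makes scripted traffic a bit less likely to be blocked.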
def brave_search(query: str, pages=1) -> list:
    """
    Gets web results from Brave Search
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    params = {
        # requests URL-encodes params itself; passing quote_plus(query) here
        # would double-encode the query
        'q': query,
        'source': 'web',
        'tf': 'at',
    }
    headers = {
        'User-Agent': USER_AGENT()
    }
    urls = []
    for page in range(pages):
        params['offset'] = page
        html = requests.get('https://search.brave.com/search', headers=headers, params=params, timeout=10)
        soup = BeautifulSoup(html.text, 'lxml')
        for result in soup.select('.snippet'):
            header = result.select_one('.result-header')
            if header is None:  # skip snippets without a result link
                continue
            link = header.get('href')
            # Sitelink extraction, kept for reference but currently disabled:
            # sitelinks = [{'title': s.get_text().strip(), 'link': s.get('href')}
            #              for s in result.select('.deep-results-buttons .deep-link')]
            urls.append(link)
    return urls
def bing_search(query: str, pages=1) -> list:
    """
    Gets web results from Bing
    :param query: query to search
    :param pages: number of search pages to scrape
    :return: a list of links in ranked order
    """
    urls = []
    for page in range(pages):
        first = page * 10 + 1  # Bing paginates via the 1-based index of the first result
        address = "https://www.bing.com/search?q=" + quote_plus(query) + '&first=' + str(first)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('li', {'class': 'b_algo'})
        urls.extend([link.find('h2').find('a')['href'] for link in links])
    return urls
def duckduckgo_search(query: str, pages=1):
    """
    NOT WORKING; LIKELY BLOCKED
    """
    urls = []
    start_index = 0
    for page in range(pages):
        address = "https://duckduckgo.com/html/?kl=en-us&q={}&s={}".format(quote_plus(query), start_index)
        data = get_html(address)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'result__a'})
        urls.extend([link['href'] for link in links])
        start_index = len(urls)  # DDG paginates by result offset, not page number
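    # Worked example of the decoding in the try-block below; the link shape
    # ('//duckduckgo.com/l/?uddg=<encoded url>') is an assumption about
    # DuckDuckGo's HTML endpoint, which may have changed since:
    #   l.split('/')[-1][5:]   ->  '=https%3A%2F%2Fexample.com%2F'
    #   parse_qs(...)[''][0]   ->  'https://example.com/'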
    try:
        # Each href is a DuckDuckGo redirect link; recover the real
        # destination URL from its query string
        urls = [parse_qs(l.split('/')[-1][5:])[''][0] for l in urls]
    except (KeyError, IndexError):
        logging.warning(f'Parsing failed for {len(urls)} urls')
    return urls
def news_search(query: str, pages=1):
    """
    Gets news results from newslookup.com
    :param query: query to search
    :param pages: number of result pages to scrape
    :return: a list of article links
    """
    urls = []
    for page in range(pages):
        api_url = f'https://newslookup.com/results?l=2&q={quote_plus(query)}&dp=&mt=-1&mkt=0&mtx=0&mktx=0&s=&groupby=no&cat=-1&from=&fmt=&tp=720&ps=50&ovs=&page={page}'
        data = get_html(api_url)
        soup = BeautifulSoup(data, 'lxml')
        links = soup.find_all('a', {'class': 'title'})
        urls.extend([link['href'] for link in links])
    return urls
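
# Usage sketch: the search helpers above are interchangeable, each returning a
# plain list of result URLs (all of them scrape third-party markup that can
# change without notice):
#   urls = bing_search('large language models', pages=2)
#   urls = news_search('large language models')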
def get_html(url: str) -> str:
    """
    Downloads the html source code of a webpage
    :param url: address of the page to fetch
    :return: html source code, or '' on any failure
    """
    try:
        headers = {
            'User-Agent': USER_AGENT()
        }
        req = Request(url, headers=headers)
        page = urlopen(req, timeout=3)
        # Decode the body instead of str()-ing the bytes object, which would
        # yield a "b'...'" literal full of escaped newlines
        return page.read().decode('utf-8', errors='ignore')
    except Exception:
        logging.warning(f'Fetching {url} failed')
        return ''
class WebParser(HTMLParser):
    """
    A class for converting tagged html to formats that can be used by an ML model
    """
    # Void elements never get a closing tag, so pushing them onto the
    # open-tag stack would desynchronize it
    VOID_TAGS = {'br', 'hr', 'img', 'input', 'link', 'meta'}

    def __init__(self):
        super().__init__()
        # Tags that terminate the current text block
        self.block_tags = {
            'div', 'p'
        }
        # Tags whose text is accumulated into the current block
        self.inline_tags = {
            '', 'a', 'b', 'tr', 'main', 'span', 'time', 'td',
            'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'strong', 'br'
        }
        self.allowed_tags = self.block_tags.union(self.inline_tags)
        self.opened_tags = []
        self.block_content = ''
        self.blocks = []
    def get_last_opened_tag(self):
        """
        Gets the most recently opened, still-unclosed tag
        :return: the tag name, or '' if no tag is open
        """
        if self.opened_tags:
            return self.opened_tags[-1]
        return ''
    def error(self, message):
        # Required override for older HTMLParser versions; parse errors are ignored
        pass
    def handle_starttag(self, tag, attrs):
        """
        Handles the start tag of an HTML node in the tree
        :param tag: the HTML tag
        :param attrs: the tag attributes
        :return:
        """
        if tag not in self.VOID_TAGS:
            self.opened_tags.append(tag)
        if tag in self.block_tags:
            # A new block-level tag ends the current block: normalize its
            # whitespace, make sure it ends in a period, and store it
            self.block_content = self.block_content.strip()
            if len(self.block_content) > 0:
                if not self.block_content.endswith('.'):
                    self.block_content += '.'
                self.block_content = self.block_content.replace('\n', ' ').replace('\r', ' ')
                self.block_content = re.sub(r"\s\s+", " ", self.block_content)
                self.blocks.append(self.block_content)
                self.block_content = ''
    def handle_endtag(self, tag):
        """
        Handles the end tag of an HTML node in the tree
        :param tag: the HTML tag
        :return:
        """
        if self.opened_tags:
            self.opened_tags.pop()
    def handle_data(self, data):
        """
        Handles a text HTML node in the tree
        :param data: the text node
        :return:
        """
        last_opened_tag = self.get_last_opened_tag()
        if last_opened_tag in self.allowed_tags:
            # Replace non-breaking spaces with regular spaces before trimming
            data = data.replace('\xa0', ' ').strip()
            if data != '':
                self.block_content += data + ' '
    def get_text(self):
        # Flush any text accumulated after the last block tag so trailing
        # content is not silently dropped
        self.block_content = self.block_content.strip()
        if self.block_content:
            self.blocks.append(self.block_content)
            self.block_content = ''
        return "\n\n".join(self.blocks)