anabmaulana committed on
Commit 228bbe9 · 1 Parent(s): a484e0b
Files changed (2)
  1. requirements.txt +137 -23
  2. web_search.py +65 -6
requirements.txt CHANGED
@@ -1,30 +1,144 @@
-tqdm==4.41.0
-transformers==3.0.2
-faiss_gpu==1.7.2
-syllables==0.1.0
-scipy==1.3.1
-nltk==3.2.5
-lxml==4.6.3
-requests==2.22.0
-Flask_Cors==3.0.8
+aiofiles==23.2.1
+aiohttp==3.8.6
+aiosignal==1.3.1
+altair==5.0.1
+annotated-types==0.5.0
+anyio==3.7.1
+anykeystore==0.2
+apex==0.9.10.dev0
+async-timeout==4.0.3
+asynctest==0.13.0
+attrs==24.2.0
+beautifulsoup4==4.9.3
+blessings==1.7
+blis==0.7.11
+boilerpipe3==1.1
+catalogue==2.0.10
+certifi @ file:///croot/certifi_1671487769961/work/certifi
+charade==1.0.3
+chardet==3.0.4
+charset-normalizer==3.4.3
+click==7.1.2
+cryptacular==1.6.2
+cycler==0.11.0
+cymem==2.0.11
+defusedxml==0.7.1
+exceptiongroup==1.3.0
+faiss-gpu==1.6.3
+fastapi==0.103.2
+ffmpy==0.3.2
+filelock==3.0.12
+Flask==2.0.0
+Flask-Cors==3.0.8
+frozenlist==1.3.3
+fsspec==2023.1.0
+future==1.0.0
+gdown==4.7.3
 gpustat==0.6.0
+gradio==3.34.0
+gradio_client==0.2.6
+greenlet==3.1.1
+h11==0.14.0
 h5py==2.10.0
-packaging==20.3
+httpcore==0.17.3
+httpx==0.24.1
+huggingface-hub==0.16.4
+hupper==1.12.1
+idna==2.8
+importlib-metadata==6.7.0
+importlib-resources==5.12.0
+itsdangerous==2.1.2
+Jinja2==3.0.3
+joblib==1.3.2
+JPype1-py3==0.5.5.4
+jsonschema==4.17.3
+kiwisolver==1.4.5
+linkify-it-py==2.0.3
+lxml==4.6.3
+markdown-it-py==2.2.0
+MarkupSafe==2.1.5
+matplotlib==3.3.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+multidict==6.0.5
+murmurhash==1.0.13
+nltk==3.2.5
+numpy==1.17.4
+nvidia-ml-py3==7.352.0
+oauthlib==3.2.2
+orjson==3.9.7
+packaging==24.0
 pandas==1.0.3
-spacy==3.0.3
+PasteDeploy==3.1.0
+pathy==0.10.3
+pbkdf2==1.3
+Pillow==9.5.0
+pkgutil_resolve_name==1.3.10
+plaster==1.1.2
+plaster-pastedeploy==1.0.1
+preshed==3.0.10
+psutil==7.0.0
+pydantic==1.10.22
+pydantic_core==2.14.6
+pydub==0.25.1
+Pygments==2.17.2
+pyparsing==3.1.4
+pyramid==2.0.2
+pyramid-mailer==0.15.1
+pyrsistent==0.19.3
+PySocks==1.7.1
+python-dateutil==2.8.1
+python-multipart==0.0.8
+python3-openid==3.2.0
+pytz==2025.2
+PyYAML==6.0.1
+regex==2024.4.16
+repoze.sendmail==4.4.1
+requests==2.22.0
+requests-oauthlib==2.0.0
+sacremoses==0.0.53
+scikit-learn==0.24.1
+scipy==1.3.1
+semantic-version==2.10.0
 sentence-transformers==0.3.0
-numpy==1.17.4
-matplotlib==3.3.1
-filelock==3.0.12
-boilerpipe3==1.1
-Unidecode==1.1.1
-Flask==2.0.0
-apex
-beautifulsoup4==4.9.3
-python_dateutil==2.8.1
-scikit_learn==0.24.1
+sentencepiece==0.2.0
+six==1.17.0
+smart-open==6.4.0
+sniffio==1.3.1
+soupsieve==2.4.1
+spacy==3.0.3
+spacy-legacy==3.0.12
+SQLAlchemy==1.4.54
+srsly==2.4.8
+starlette==0.27.0
+syllables==0.1.0
 tantivy==0.13.2
-huggingface_hub==0.16.4
+thinc==8.0.17
+threadpoolctl==3.1.0
+tokenizers==0.8.1rc1
+toolz==0.12.1
 torch==1.6.0+cu101
 torchvision==0.7.0+cu101
-gradio==3.34.0
+tqdm==4.67.1
+transaction==4.0
+transformers==3.0.2
+translationstring==1.4
+typer==0.3.2
+typing_extensions==4.7.1
+uc-micro-py==1.0.3
+Unidecode==1.1.1
+urllib3==1.25.11
+uvicorn==0.22.0
+velruse==1.1.1
+venusian==3.1.1
+wasabi==0.10.1
+WebOb==1.8.9
+websockets==11.0.3
+Werkzeug==2.2.3
+WTForms==3.0.1
+wtforms-recaptcha==0.3.2
+yarl==1.9.4
+zipp==3.15.0
+zope.deprecation==5.0
+zope.interface==6.4.post2
+zope.sqlalchemy==3.1
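A note on installing (not part of the commit): the pinned torch==1.6.0+cu101 and torchvision==0.7.0+cu101 are CUDA 10.1 builds that are not published on PyPI, so a bare pip install of this file cannot resolve them. A minimal sketch, assuming the PyTorch stable wheel index still serves these builds:

    # Assumption: download.pytorch.org still hosts the 1.6.0+cu101 wheels.
    pip install -r requirements.txt -f https://download.pytorch.org/whl/torch_stable.html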
web_search.py CHANGED
@@ -1,13 +1,64 @@
+import random
 import re
 
 from pprint import pprint
 from urllib.request import Request, urlopen
 from html.parser import HTMLParser
 from bs4 import BeautifulSoup
-from urllib.parse import quote_plus
+from urllib.parse import quote_plus, parse_qs
+
+import requests, lxml, json
+import logging
+
+import pdb
+
+def USER_AGENT():
+    uastrings = [
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
+        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
+    ]
+    ua = random.choice(uastrings)
+    logging.info(f'Using user agent: {ua}')
+    return ua
+
+
+def brave_search(query: str, pages=1) -> list:
+    params = {
+        'q': quote_plus(query),
+        'source': 'web',
+        'tf': 'at',
+    }
+    headers = {
+        'User-Agent': USER_AGENT()
+    }
 
+    urls = []
+    for page in range(pages):
+        params['offset'] = page
+        html = requests.get('https://search.brave.com/search', headers=headers, params=params)
+        soup = BeautifulSoup(html.text, 'lxml')
+
+        for result in soup.select('.snippet'):
+            link = result.select_one('.result-header').get('href')
+            """
+            sitelinks_container = result.select('.deep-results-buttons .deep-link')
+            sitelinks = None
+
+            if sitelinks_container:
+                sitelinks = []
+                for sitelink in sitelinks_container:
+                    sitelinks.append({
+                        'title': sitelink.get_text().strip(),
+                        'link': sitelink.get('href')
+                    })
+            """
+            urls.append(link)
+    return urls
 
-def bing_search(query: str, pages_number=1) -> list:
+
+def bing_search(query: str, pages=1) -> list:
     """
     Gets web results from Bing
     :param query: query to search
@@ -15,7 +66,7 @@ def bing_search(query: str, pages_number=1) -> list:
     :return: a list of links in ranked order
     """
     urls = []
-    for page in range(pages_number):
+    for page in range(pages):
         first = page * 10 + 1
         address = "https://www.bing.com/search?q=" + quote_plus(query) + '&first=' + str(first)
         data = get_html(address)
@@ -27,6 +78,9 @@
 
 
 def duckduckgo_search(query: str, pages=1):
+    """
+    NOT WORKING; LIKELY BLOCKED
+    """
     urls = []
     start_index = 0
     for page in range(pages):
@@ -36,7 +90,10 @@ def duckduckgo_search(query: str, pages=1):
         links = soup.findAll('a', {'class': 'result__a'})
         urls.extend([link['href'] for link in links])
         start_index = len(urls)
-
+    try:
+        urls = [parse_qs(l.split('/')[-1][5:])[''][0] for l in urls]
+    except:
+        logging.warn(f'Parsing failed for {len(urls)} urls')
     return urls
 
 
@@ -58,8 +115,10 @@ def get_html(url: str) -> str:
     :return: html source code
     """
     try:
-        custom_user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
-        req = Request(url, headers={"User-Agent": custom_user_agent})
+        headers = {
+            'User-Agent': USER_AGENT()
+        }
+        req = Request(url, headers=headers)
         page = urlopen(req, timeout=3)
         return str(page.read())
     except:
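Taken together, the web_search.py changes replace the single hard-coded Firefox user agent with a rotating USER_AGENT() helper, add a Brave scraper as a new engine, and flag duckduckgo_search as blocked. The DuckDuckGo change also decodes the engine's redirect links: each result href appears to wrap the target as a uddg= query parameter (roughly //duckduckgo.com/l/?uddg=<url-encoded target>), and the parse_qs expression unwraps it. A minimal usage sketch follows; it is hypothetical, not part of the commit, and assumes web_search.py is importable and that Brave's markup still matches the .snippet / .result-header selectors:

    # Hypothetical driver, not part of this commit.
    import logging
    from urllib.parse import parse_qs

    from web_search import brave_search, bing_search

    logging.basicConfig(level=logging.INFO)

    # Worked example of the DuckDuckGo redirect decoding used above:
    # the last path segment is '?uddg=<encoded>'; dropping the first five
    # characters ('?uddg') leaves '=<encoded>', which parse_qs files under
    # the empty-string key.
    href = '//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F'
    assert parse_qs(href.split('/')[-1][5:])[''][0] == 'https://example.com/'

    # Query Brave first; fall back to Bing if the selectors return nothing.
    query = 'retrieval augmented generation'
    urls = brave_search(query, pages=1) or bing_search(query, pages=1)
    for rank, url in enumerate(urls, start=1):
        print(rank, url)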