NCTCMumbai committed on
Commit
03e03a6
·
verified ·
1 Parent(s): edfb3fb

Update middlewares/search_client.py

Browse files
Files changed (1) hide show
  1. middlewares/search_client.py +27 -18
middlewares/search_client.py CHANGED
@@ -2,14 +2,15 @@ import requests
2
  from bs4 import BeautifulSoup
3
  import re
4
  import concurrent.futures
 
5
 
6
 
7
  class SearchClient:
8
  def __init__(self, vendor, engine_id=None, api_key=None):
9
  self.vendor = vendor
10
- if vendor == "google":
11
- self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
12
- elif vendor == "bing":
13
  self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
14
  self.headers = {
15
  "Ocp-Apim-Subscription-Key": api_key,
@@ -42,21 +43,25 @@ class SearchClient:
42
  print(f"Error fetching data from {link}: {e}")
43
  return results
44
 
45
def _google_search(self, query, n_crawl):
    """Query the configured search endpoint and fetch text for the top hits.

    Sends ``query`` as the ``q`` parameter of a GET request to
    ``self.endpoint``, reads the ``link`` field of at most ``n_crawl``
    entries found under the ``items`` key of the JSON response, and passes
    those links to ``_fetch_text_from_links``.

    Returns:
        Whatever ``_fetch_text_from_links`` returns for the collected links.
    """
    payload = requests.get(self.endpoint, params={"q": query}).json()
    hits = payload.get("items", [])
    # A non-positive ``n_crawl`` must select nothing, so clamp before slicing
    # (a bare negative slice bound would count from the end instead).
    links = [hit["link"] for hit in hits[:max(n_crawl, 0)]]
    return self._fetch_text_from_links(links)
61
 
62
  def _bing_search(self, query, n_crawl):
@@ -77,9 +82,13 @@ class SearchClient:
77
  return text_results
78
 
79
  def search(self, query, n_crawl):
80
- if self.vendor == "google":
81
- return self._google_search(query, n_crawl)
82
- elif self.vendor == "bing":
83
  return self._bing_search(query, n_crawl)
84
  else:
85
  return "Invalid vendor"
 
 
 
 
 
 
 
2
  from bs4 import BeautifulSoup
3
  import re
4
  import concurrent.futures
5
+ from GoogleNews import GoogleNews
6
 
7
 
8
  class SearchClient:
9
  def __init__(self, vendor, engine_id=None, api_key=None):
10
  self.vendor = vendor
11
+ # if vendor == "google":
12
+ # self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
13
+ if vendor == "bing":
14
  self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
15
  self.headers = {
16
  "Ocp-Apim-Subscription-Key": api_key,
 
43
  print(f"Error fetching data from {link}: {e}")
44
  return results
45
 
46
def _google_search(self, start_date, end_date, query, n_crawl):
    """Collect Google News links for each keyword and fetch their text.

    Args:
        start_date: Start of the date window, forwarded to ``GoogleNews``
            as ``start``.
        end_date: End of the date window, forwarded to ``GoogleNews`` as
            ``end``.
        query: One or more keywords separated by commas. Surrounding
            whitespace is stripped and empty entries are skipped.
        n_crawl: Maximum number of links kept per keyword.

    Returns:
        Whatever ``_fetch_text_from_links`` returns for the links gathered
        across all keywords.
    """
    urls = []
    for keyword in query.split(","):
        keyword = keyword.strip()
        if not keyword:
            continue

        # Build ONE client carrying language, region and the date window.
        # Previously a date-ranged client was created and then immediately
        # replaced by a second one without dates, dropping the window.
        googlenews = GoogleNews(
            lang="en", region="IN", start=start_date, end=end_date
        )
        googlenews.get_news(keyword)

        # Accumulate across keywords; previously only a single keyword's
        # links survived to the fetch step.
        for link in googlenews.get_links()[:n_crawl]:
            # The original code treated the links as scheme-less; only add
            # the prefix when it is actually missing.
            if not link.startswith(("http://", "https://")):
                link = "https://" + link
            urls.append(link)

    return self._fetch_text_from_links(urls)
66
 
67
  def _bing_search(self, query, n_crawl):
 
82
  return text_results
83
 
84
  def search(self, query, n_crawl):
85
+ if self.vendor == "bing":
 
 
86
  return self._bing_search(query, n_crawl)
87
  else:
88
  return "Invalid vendor"
89
+
90
def search_google(self, start_date, end_date, query, n_crawl):
    """Run a date-bounded Google search when configured for Google.

    Returns:
        The result of ``_google_search(start_date, end_date, query,
        n_crawl)`` when ``self.vendor`` equals ``"google"``; otherwise the
        string ``"Invalid vendor"``.
    """
    # Guard clause: anything other than Google is rejected up front.
    if self.vendor != "google":
        return "Invalid vendor"
    return self._google_search(start_date, end_date, query, n_crawl)