from langchain.tools import tool
from scholarly import scholarly, ProxyGenerator
from bs4 import BeautifulSoup
import requests
import datetime
import json
import time

# Configure scholarly with a free-proxy pool to avoid Google Scholar blocking
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)
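
# Free proxies are often flaky. scholarly's ProxyGenerator also supports paid
# providers; a hedged alternative, assuming a ScraperAPI account and key:
#   pg = ProxyGenerator()
#   pg.ScraperAPI("YOUR_SCRAPERAPI_KEY")  # placeholder key, not from the original
#   scholarly.use_proxy(pg)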

@tool
def pmc_search(query: str) -> str:
    """Search PubMed Central (PMC) for articles"""
    try:
        # NCBI E-utilities endpoints
        search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

        # Search parameters: top 20 PMC hits by relevance, JSON response
        search_params = {
            "db": "pmc",
            "term": query,
            "retmax": 20,
            "retmode": "json",
            "sort": "relevance"
        }

        # Get article IDs
        response = requests.get(search_url, params=search_params, timeout=10)
        if not response.ok:
            return json.dumps([{"error": "PubMed search failed"}])
        try:
            search_data = response.json()
            article_ids = search_data.get("esearchresult", {}).get("idlist", [])
        except ValueError:
            # Fall back to XML parsing if the JSON body is malformed
            soup = BeautifulSoup(response.text, "xml")
            article_ids = [id_tag.text for id_tag in soup.find_all("Id")]

        articles = []
        for pmid in article_ids:
            try:
                # Fetch full article metadata as XML
                fetch_params = {
                    "db": "pmc",
                    "id": pmid,
                    "retmode": "xml"
                }
                article_response = requests.get(fetch_url, params=fetch_params, timeout=10)
                if not article_response.ok:
                    continue
                article_soup = BeautifulSoup(article_response.text, "xml")

                # Extract title and abstract
                title_elem = article_soup.find("article-title")
                title = title_elem.text if title_elem else "No title"
                abstract_elem = article_soup.find("abstract")
                abstract = abstract_elem.text if abstract_elem else "No abstract"

                # Collect author names (covers both PMC and PubMed tag variants)
                authors = []
                for author in article_soup.find_all(["author", "contrib"]):
                    surname = author.find(["surname", "last-name"])
                    given_name = author.find(["given-names", "first-name"])
                    if surname:
                        author_name = surname.text
                        if given_name:
                            author_name = f"{given_name.text} {author_name}"
                        authors.append(author_name)

                year_elem = article_soup.find(["pub-date", "year"])
                year = year_elem.find("year").text if year_elem and year_elem.find("year") else "Unknown"
                journal_elem = article_soup.find(["journal-title", "source"])
                journal = journal_elem.text if journal_elem else "Unknown Journal"

                articles.append({
                    "id": pmid,
                    "title": title,
                    "authors": authors,
                    "year": year,
                    "journal": journal,
                    "abstract": abstract
                })
                # Pause between requests to stay under NCBI rate limits
                time.sleep(0.5)
            except Exception:
                continue
        return json.dumps(articles, indent=2)
    except Exception as e:
        return json.dumps([{"error": f"PMC search failed: {str(e)}"}])

@tool
def google_scholar_search(query: str) -> str:
    """Search Google Scholar for articles"""
    try:
        # Refresh the free-proxy pool before searching. FreeProxies() returns
        # True only when a working proxy was found; scholarly.use_proxy()
        # returns None, so it cannot itself be used as a success check.
        if pg.FreeProxies():
            scholarly.use_proxy(pg)

        search_query = scholarly.search_pubs(query)
        results = []
        count = 0
        max_retries = 3
        while count < 20:
            try:
                result = next(search_query)
                # scholarly >= 1.0 yields plain dicts with a "bib" sub-dict
                # (older versions exposed a .bib attribute instead)
                bib = result.get("bib", {})
                raw_authors = bib.get("author", "No author")
                pub = {
                    "title": bib.get("title", "No title"),
                    # Authors may be a list (scholarly 1.x) or an
                    # " and "-joined string (older versions)
                    "authors": raw_authors if isinstance(raw_authors, list)
                               else raw_authors.split(" and "),
                    "year": bib.get("pub_year", bib.get("year", "No year")),
                    "abstract": bib.get("abstract", "No abstract"),
                    "journal": bib.get("journal", bib.get("venue", "No venue")),
                    "citations": result.get("num_citations", 0)
                }
                # Skip entries missing a title or abstract
                if pub["title"] == "No title" or pub["abstract"] == "No abstract":
                    continue
                results.append(pub)
                count += 1
                # Pause between results to avoid rate limiting
                time.sleep(0.5)
            except StopIteration:
                break
            except Exception:
                if max_retries > 0:
                    max_retries -= 1
                    time.sleep(1)
                    continue
                else:
                    break
        return json.dumps(results, indent=2)
    except Exception as e:
        return json.dumps([{"error": f"Google Scholar search failed: {str(e)}"}])

@tool
def today_tool() -> str:
    """Get today's date"""
    return str(datetime.date.today())
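
# --- Usage sketch (illustrative; not part of the original Space) ---
# A minimal check, assuming a recent LangChain where @tool-decorated
# functions implement the Runnable interface and can be invoked directly.
if __name__ == "__main__":
    raw = pmc_search.invoke("CRISPR base editing")  # single-arg tools accept a raw string
    papers = json.loads(raw)  # each tool returns a JSON string, parse it back
    print(f"PMC returned {len(papers)} records")
    print(today_tool.invoke({}))  # zero-arg tools take an empty input dict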