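"""LangChain tools for literature search: PubMed Central via the NCBI
E-utilities, Google Scholar via the scholarly package, and a small helper
that returns today's date."""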
from langchain.tools import tool
from scholarly import scholarly, ProxyGenerator
from bs4 import BeautifulSoup
import requests
import datetime
import json
import time
# Configure scholarly with proxy to avoid blocking
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)

@tool
def pmc_search(query: str) -> str:
    """Search PubMed Central (PMC) for articles"""
    try:
        # Base URLs for the NCBI E-utilities (ESearch and EFetch)
        search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

        # Search parameters
        search_params = {
            "db": "pmc",
            "term": query,
            "retmax": 20,
            "retmode": "json",
            "sort": "relevance"
        }

        # Get article IDs
        response = requests.get(search_url, params=search_params)
        if not response.ok:
            return json.dumps([{"error": "PubMed search failed"}])

        try:
            search_data = response.json()
            article_ids = search_data.get("esearchresult", {}).get("idlist", [])
        except ValueError:
            # Fall back to XML parsing if the JSON payload cannot be decoded
            soup = BeautifulSoup(response.text, 'xml')
            article_ids = [id_tag.text for id_tag in soup.find_all('Id')]

        articles = []
        for pmid in article_ids:
            try:
                # Fetch article details
                fetch_params = {
                    "db": "pmc",
                    "id": pmid,
                    "retmode": "xml"
                }
                article_response = requests.get(fetch_url, params=fetch_params)
                if not article_response.ok:
                    continue
                article_soup = BeautifulSoup(article_response.text, 'xml')

                # Extract article data
                title_elem = article_soup.find("article-title")
                title = title_elem.text if title_elem else "No title"

                abstract_elem = article_soup.find("abstract")
                abstract = abstract_elem.text if abstract_elem else "No abstract"

                authors = []
                for author in article_soup.find_all(["author", "contrib"]):
                    surname = author.find(["surname", "last-name"])
                    given_name = author.find(["given-names", "first-name"])
                    if surname:
                        author_name = surname.text
                        if given_name:
                            author_name = f"{given_name.text} {author_name}"
                        authors.append(author_name)

                year_elem = article_soup.find(["pub-date", "year"])
                year = year_elem.find("year").text if year_elem and year_elem.find("year") else "Unknown"

                journal_elem = article_soup.find(["journal-title", "source"])
                journal = journal_elem.text if journal_elem else "Unknown Journal"

                articles.append({
                    "id": pmid,
                    "title": title,
                    "authors": authors,
                    "year": year,
                    "journal": journal,
                    "abstract": abstract
                })

                # Add delay to avoid rate limiting
                time.sleep(0.5)
            except Exception:
                # Skip articles whose records cannot be fetched or parsed
                continue

        return json.dumps(articles, indent=2)
    except Exception as e:
        return json.dumps([{"error": f"PMC search failed: {str(e)}"}])

@tool
def google_scholar_search(query: str) -> str:
    """Search Google Scholar for articles"""
    try:
        # Configure proxy and retry mechanism
        if not scholarly.use_proxy(pg):
            pg.FreeProxies()
            scholarly.use_proxy(pg)

        search_query = scholarly.search_pubs(query)
        results = []
        count = 0
        max_retries = 3

        while count < 20:
            try:
                result = next(search_query)

                # Extract publication data
                pub = {
                    "title": result.bib.get('title', 'No title'),
                    "authors": result.bib.get('author', 'No author').split(" and "),
                    "year": result.bib.get('year', 'No year'),
                    "abstract": result.bib.get('abstract', 'No abstract'),
                    "journal": result.bib.get('journal', result.bib.get('venue', 'No venue')),
                    "citations": result.citedby if hasattr(result, 'citedby') else 0
                }

                # Skip results that lack a title or an abstract
                if pub["title"] == 'No title' or pub["abstract"] == 'No abstract':
                    continue

                results.append(pub)
                count += 1

                # Add delay to avoid rate limiting
                time.sleep(0.5)
            except StopIteration:
                break
            except Exception:
                # Retry a few times on transient errors, then give up
                if max_retries > 0:
                    max_retries -= 1
                    time.sleep(1)
                    continue
                else:
                    break

        return json.dumps(results, indent=2)
    except Exception as e:
        return json.dumps([{"error": f"Google Scholar search failed: {str(e)}"}])

@tool
def today_tool() -> str:
    """Get today's date"""
    return str(datetime.date.today())
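

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original file): the @tool
    # decorator wraps each function as a LangChain Tool object, and it is
    # assumed here that the Runnable-style .invoke() is available; on older
    # langchain versions, .run() may be needed instead. The query strings are
    # arbitrary example inputs.
    print(today_tool.invoke({}))
    print(pmc_search.invoke("intraocular pressure"))
    print(google_scholar_search.invoke("glaucoma treatment"))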