from langchain.tools import tool
from scholarly import scholarly, ProxyGenerator
from bs4 import BeautifulSoup
import requests
import datetime
import json
import time

# Configure scholarly with a free proxy pool to reduce the risk of
# Google Scholar blocking repeated requests.
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)


@tool
def pmc_search(query: str) -> str:
    """Search PubMed Central (PMC) for articles."""
    try:
        # NCBI E-utilities endpoints for searching and fetching records
        search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

        # Search parameters: top 20 PMC hits, sorted by relevance
        search_params = {
            "db": "pmc",
            "term": query,
            "retmax": 20,
            "retmode": "json",
            "sort": "relevance",
        }

        # Get the list of matching article IDs
        response = requests.get(search_url, params=search_params)
        if not response.ok:
            return json.dumps([{"error": "PubMed search failed"}])

        try:
            search_data = response.json()
            article_ids = search_data.get("esearchresult", {}).get("idlist", [])
        except ValueError:
            # Fall back to XML parsing if the JSON response is malformed
            soup = BeautifulSoup(response.text, "xml")
            article_ids = [id_tag.text for id_tag in soup.find_all("Id")]

        articles = []
        for pmid in article_ids:
            try:
                # Fetch the full XML record for this article ID
                fetch_params = {
                    "db": "pmc",
                    "id": pmid,
                    "retmode": "xml",
                }
                article_response = requests.get(fetch_url, params=fetch_params)
                if not article_response.ok:
                    continue

                article_soup = BeautifulSoup(article_response.text, "xml")

                # Extract title, abstract, authors, year, and journal
                title_elem = article_soup.find("article-title")
                title = title_elem.text if title_elem else "No title"

                abstract_elem = article_soup.find("abstract")
                abstract = abstract_elem.text if abstract_elem else "No abstract"

                authors = []
                for author in article_soup.find_all(["author", "contrib"]):
                    surname = author.find(["surname", "last-name"])
                    given_name = author.find(["given-names", "first-name"])
                    if surname:
                        author_name = surname.text
                        if given_name:
                            author_name = f"{given_name.text} {author_name}"
                        authors.append(author_name)

                year_elem = article_soup.find(["pub-date", "year"])
                year = (
                    year_elem.find("year").text
                    if year_elem and year_elem.find("year")
                    else "Unknown"
                )

                journal_elem = article_soup.find(["journal-title", "source"])
                journal = journal_elem.text if journal_elem else "Unknown Journal"

                articles.append({
                    "id": pmid,
                    "title": title,
                    "authors": authors,
                    "year": year,
                    "journal": journal,
                    "abstract": abstract,
                })

                # Throttle requests to stay within NCBI E-utilities rate
                # limits (at most 3 requests/second without an API key)
                time.sleep(0.5)
            except Exception:
                continue

        return json.dumps(articles, indent=2)

    except Exception as e:
        return json.dumps([{"error": f"PMC search failed: {str(e)}"}])


@tool
def google_scholar_search(query: str) -> str:
    """Search Google Scholar for articles."""
    try:
        # Re-apply the proxy configuration if the current one has gone stale
        if not scholarly.use_proxy(pg):
            pg.FreeProxies()
            scholarly.use_proxy(pg)

        search_query = scholarly.search_pubs(query)
        results = []
        count = 0
        max_retries = 3

        # Collect up to 20 usable publications from the result iterator
        while count < 20:
            try:
                result = next(search_query)

                # Extract publication data
                pub = {
                    "title": result.bib.get("title", "No title"),
                    "authors": result.bib.get("author", "No author").split(" and "),
                    "year": result.bib.get("year", "No year"),
                    "abstract": result.bib.get("abstract", "No abstract"),
                    "journal": result.bib.get(
                        "journal", result.bib.get("venue", "No venue")
                    ),
                    "citations": result.citedby if hasattr(result, "citedby") else 0,
                }

                # Skip results that lack a title or an abstract
                if pub["title"] == "No title" or pub["abstract"] == "No abstract":
                    continue

                results.append(pub)
                count += 1

                # Throttle requests to avoid rate limiting
                time.sleep(0.5)
            except StopIteration:
                break
            except Exception:
                # Back off briefly and retry a few times before giving up
                if max_retries > 0:
                    max_retries -= 1
                    time.sleep(1)
                    continue
                else:
                    break

        return json.dumps(results, indent=2)

    except Exception as e:
        return json.dumps([{"error": f"Google Scholar search failed: {str(e)}"}])


@tool
def today_tool() -> str:
    """Get today's date."""
    return str(datetime.date.today())
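

# --- Usage sketch (illustrative, not part of the tool definitions) ---------
# A minimal smoke test, assuming a LangChain release where @tool-decorated
# functions expose .invoke(); older releases use .run(...) instead. The
# query string is an arbitrary example, not taken from the original code.
if __name__ == "__main__":
    # Each tool returns a JSON string, so the output can be printed directly
    # or parsed with json.loads() for further processing.
    print(pmc_search.invoke("CRISPR base editing"))
    print(google_scholar_search.invoke("CRISPR base editing"))
    print(today_tool.invoke({}))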