import datetime
import json
import re
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from langchain.tools import tool  # used in the usage sketch at the end of this file

def clean_text(text):
    """Clean text of HTML tags and extra whitespace"""
    if not text:
        return ""
    text = re.sub(r'<[^>]+>', '', text)  # strip HTML/XML tags
    text = re.sub(r'\s+', ' ', text)     # collapse whitespace runs
    return text.strip()
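
# Example behaviour of clean_text (illustrative input, not from the original):
#
#   clean_text("<p>Deep   brain\n stimulation</p>")  ->  "Deep brain stimulation"
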
def pmc_search(query: str) -> str:
    """Search PubMed Central (PMC) for articles"""
    try:
        # Base URLs for the NCBI E-utilities APIs
        search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

        # Search parameters
        search_params = {
            "db": "pmc",
            "term": query,
            "retmax": 20,
            "retmode": "json",
            "sort": "relevance"
        }

        # Get matching article IDs
        response = requests.get(search_url, params=search_params, timeout=30)
        if not response.ok:
            return json.dumps([{"error": "PubMed search failed"}])

        try:
            search_data = response.json()
            article_ids = search_data.get("esearchresult", {}).get("idlist", [])
        except ValueError:
            # Fall back to XML parsing if the JSON payload is malformed
            soup = BeautifulSoup(response.text, "xml")
            article_ids = [id_tag.text for id_tag in soup.find_all("Id")]

        articles = []
        for pmid in article_ids:
            try:
                # Fetch the full article record as XML
                fetch_params = {
                    "db": "pmc",
                    "id": pmid,
                    "retmode": "xml"
                }
                article_response = requests.get(fetch_url, params=fetch_params, timeout=30)
                if not article_response.ok:
                    continue

                article_soup = BeautifulSoup(article_response.text, "xml")

                # Extract title and abstract
                title_elem = article_soup.find("article-title")
                title = clean_text(title_elem.text if title_elem else "No title")

                abstract_elem = article_soup.find("abstract")
                abstract = clean_text(abstract_elem.text if abstract_elem else "No abstract")

                # Collect author names ("Given Surname" where both parts exist)
                authors = []
                for author in article_soup.find_all(["author", "contrib"]):
                    surname = author.find(["surname", "last-name"])
                    given_name = author.find(["given-names", "first-name"])
                    if surname:
                        author_name = surname.text
                        if given_name:
                            author_name = f"{given_name.text} {author_name}"
                        authors.append(clean_text(author_name))

                # Prefer the <year> inside <pub-date>; fall back to any bare <year>
                pub_date = article_soup.find("pub-date")
                year_tag = pub_date.find("year") if pub_date else article_soup.find("year")
                year = year_tag.text if year_tag else "Unknown"

                journal_elem = article_soup.find(["journal-title", "source"])
                journal = clean_text(journal_elem.text if journal_elem else "Unknown Journal")

                articles.append({
                    "id": pmid,
                    "title": title,
                    "authors": authors,
                    "year": year,
                    "journal": journal,
                    "abstract": abstract
                })

                # Delay between requests to avoid NCBI rate limiting
                time.sleep(0.5)
            except Exception:
                continue

        return json.dumps(articles, indent=2)
    except Exception as e:
        return json.dumps([{"error": f"PMC search failed: {e}"}])
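
# Example of calling pmc_search directly (illustrative; the query is
# arbitrary and the output shape follows the dicts built above):
#
#   articles = json.loads(pmc_search("CRISPR gene therapy"))
#   for article in articles[:3]:
#       print(article.get("year"), "-", article.get("title"))
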
def google_scholar_search(query: str) -> str:
    """Search alternative sources for medical literature"""
    try:
        # Query Europe PMC and Semantic Scholar (not Google Scholar itself,
        # which offers no public API)
        encoded_query = quote(query)
        search_urls = [
            f"https://europepmc.org/webservices/rest/search?query={encoded_query}&format=json&pageSize=20",
            f"https://api.semanticscholar.org/graph/v1/paper/search?query={encoded_query}&limit=20&fields=title,abstract,year,authors,venue"
        ]

        results = []
        for url in search_urls:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }
                response = requests.get(url, headers=headers, timeout=30)
                if not response.ok:
                    continue
                data = response.json()

                # Handle Europe PMC response ("or" fallbacks cover null JSON values)
                if "resultList" in data:
                    for result in data["resultList"].get("result", []):
                        pub = {
                            "title": clean_text(result.get("title") or "No title"),
                            "authors": [clean_text(author.get("fullName") or "Unknown")
                                        for author in result.get("authorList", {}).get("author", [])],
                            "year": result.get("pubYear") or "Unknown",
                            "journal": clean_text(result.get("journalTitle") or "Unknown Journal"),
                            "abstract": clean_text(result.get("abstractText") or "No abstract"),
                            "source": "Europe PMC"
                        }
                        # Keep only results with a real title and abstract
                        if pub["title"] != "No title" and pub["abstract"] != "No abstract":
                            results.append(pub)

                # Handle Semantic Scholar response
                elif "data" in data:
                    for paper in data["data"]:
                        pub = {
                            "title": clean_text(paper.get("title") or "No title"),
                            "authors": [clean_text(author.get("name") or "Unknown")
                                        for author in paper.get("authors", [])],
                            "year": paper.get("year") or "Unknown",
                            "journal": clean_text(paper.get("venue") or "Unknown Journal"),
                            "abstract": clean_text(paper.get("abstract") or "No abstract"),
                            "source": "Semantic Scholar"
                        }
                        if pub["title"] != "No title" and pub["abstract"] != "No abstract":
                            results.append(pub)

                time.sleep(1)  # Rate limiting between providers
            except Exception:
                continue

        return json.dumps(results, indent=2)
    except Exception as e:
        return json.dumps([{"error": f"Literature search failed: {e}"}])
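
# The two search tools complement each other; a sketch of merging their JSON
# output and de-duplicating by title (illustrative, hypothetical names):
#
#   q = "deep brain stimulation"
#   combined = json.loads(pmc_search(q)) + json.loads(google_scholar_search(q))
#   by_title = {a["title"].lower(): a for a in combined if "error" not in a}
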
def today_tool() -> str:
    """Get today's date"""
    return str(datetime.date.today())
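
# --- Usage sketch ---
# A minimal, illustrative example of registering the functions above as
# LangChain tools, which is presumably why `tool` is imported. Handing the
# tools to an agent depends on the LangChain version and LLM provider in
# use, so only tool construction is shown here.
if __name__ == "__main__":
    search_tools = [tool(pmc_search), tool(google_scholar_search), tool(today_tool)]
    for t in search_tools:
        print(f"{t.name}: {t.description}")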