import datetime
import json
import re
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from langchain.tools import tool
def clean_text(text):
    """Clean text by stripping HTML tags and collapsing extra whitespace"""
    if not text:
        return ""
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
@tool
def pmc_search(query: str) -> str:
    """Search PubMed Central (PMC) for articles"""
    try:
        # Base URLs for the NCBI E-utilities APIs
        search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

        # Search parameters
        search_params = {
            "db": "pmc",
            "term": query,
            "retmax": 20,
            "retmode": "json",
            "sort": "relevance"
        }

        # Get article IDs
        response = requests.get(search_url, params=search_params, timeout=30)
        if not response.ok:
            return json.dumps([{"error": "PubMed search failed"}])

        try:
            search_data = response.json()
            article_ids = search_data.get("esearchresult", {}).get("idlist", [])
        except ValueError:
            # Fall back to XML parsing if the JSON response is malformed
            soup = BeautifulSoup(response.text, "xml")
            article_ids = [id_tag.text for id_tag in soup.find_all("Id")]

        articles = []
        for pmid in article_ids:
            try:
                # Fetch full article metadata as XML
                fetch_params = {
                    "db": "pmc",
                    "id": pmid,
                    "retmode": "xml"
                }
                article_response = requests.get(fetch_url, params=fetch_params, timeout=30)
                if not article_response.ok:
                    continue

                article_soup = BeautifulSoup(article_response.text, "xml")

                # Extract article data
                title_elem = article_soup.find("article-title")
                title = clean_text(title_elem.text if title_elem else "No title")

                abstract_elem = article_soup.find("abstract")
                abstract = clean_text(abstract_elem.text if abstract_elem else "No abstract")

                authors = []
                for author in article_soup.find_all(["author", "contrib"]):
                    surname = author.find(["surname", "last-name"])
                    given_name = author.find(["given-names", "first-name"])
                    if surname:
                        author_name = surname.text
                        if given_name:
                            author_name = f"{given_name.text} {author_name}"
                        authors.append(clean_text(author_name))

                # The match may be the <year> tag itself or a <pub-date> containing one
                year = "Unknown"
                year_elem = article_soup.find(["pub-date", "year"])
                if year_elem is not None:
                    year_tag = year_elem if year_elem.name == "year" else year_elem.find("year")
                    if year_tag is not None:
                        year = year_tag.text

                journal_elem = article_soup.find(["journal-title", "source"])
                journal = clean_text(journal_elem.text if journal_elem else "Unknown Journal")

                articles.append({
                    "id": pmid,
                    "title": title,
                    "authors": authors,
                    "year": year,
                    "journal": journal,
                    "abstract": abstract
                })

                # Brief delay between requests to avoid rate limiting
                time.sleep(0.5)
            except Exception:
                continue

        return json.dumps(articles, indent=2)
    except Exception as e:
        return json.dumps([{"error": f"PMC search failed: {str(e)}"}])
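
# Note: NCBI E-utilities rate-limits anonymous clients to roughly 3
# requests/second (about 10/second with an API key). If higher throughput
# is needed, an optional `api_key` entry can be added to both parameter
# dicts above, e.g. (hypothetical environment variable name; requires
# `import os`):
#
#   search_params["api_key"] = os.environ.get("NCBI_API_KEY", "")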
@tool
def google_scholar_search(query: str) -> str:
    """Search alternative medical literature sources (Europe PMC and Semantic Scholar)"""
    try:
        # URL-encode the query before interpolating it into the request URLs
        encoded_query = quote(query)
        search_urls = [
            f"https://europepmc.org/webservices/rest/search?query={encoded_query}&format=json&pageSize=20",
            f"https://api.semanticscholar.org/graph/v1/paper/search?query={encoded_query}&limit=20&fields=title,abstract,year,authors,venue"
        ]

        results = []
        for url in search_urls:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }
                response = requests.get(url, headers=headers, timeout=30)
                if not response.ok:
                    continue
                data = response.json()

                # Handle Europe PMC response
                if "resultList" in data:
                    for result in data["resultList"].get("result", []):
                        pub = {
                            "title": clean_text(result.get("title", "No title")),
                            "authors": [clean_text(author.get("fullName", "Unknown"))
                                        for author in result.get("authorList", {}).get("author", [])],
                            "year": result.get("pubYear", "Unknown"),
                            "journal": clean_text(result.get("journalTitle", "Unknown Journal")),
                            "abstract": clean_text(result.get("abstractText", "No abstract")),
                            "source": "Europe PMC"
                        }
                        # Keep only results that have both a title and an abstract
                        if pub["title"] != "No title" and pub["abstract"] != "No abstract":
                            results.append(pub)

                # Handle Semantic Scholar response (fields may be present but null,
                # so use `or` fallbacks rather than dict defaults)
                elif "data" in data:
                    for paper in data["data"]:
                        pub = {
                            "title": clean_text(paper.get("title") or "No title"),
                            "authors": [clean_text(author.get("name", "Unknown"))
                                        for author in paper.get("authors", [])],
                            "year": paper.get("year") or "Unknown",
                            "journal": clean_text(paper.get("venue") or "Unknown Journal"),
                            "abstract": clean_text(paper.get("abstract") or "No abstract"),
                            "source": "Semantic Scholar"
                        }
                        if pub["title"] != "No title" and pub["abstract"] != "No abstract":
                            results.append(pub)

                time.sleep(1)  # Rate limiting between API calls
            except Exception:
                continue

        return json.dumps(results, indent=2)
    except Exception as e:
        return json.dumps([{"error": f"Literature search failed: {str(e)}"}])
@tool
def today_tool() -> str:
    """Get today's date"""
    return str(datetime.date.today())
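
# --- Usage sketch ---
# A minimal example of invoking these tools directly, assuming a LangChain
# version where @tool-decorated functions expose .invoke(); the sample query
# is illustrative, not part of the original module.
if __name__ == "__main__":
    print(today_tool.invoke({}))  # zero-argument tools take an empty dict
    hits = json.loads(pmc_search.invoke("CRISPR gene therapy"))
    for article in hits[:3]:
        print(article.get("year"), "-", article.get("title"))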