"""
retrieve_guidelines.py
----------------------
Tool for retrieving clinical practice guidelines, with focus on IDSA
(Infectious Diseases Society of America) guidelines.

This tool searches for and retrieves the most current clinical guidelines based
on user queries about specific infectious disease topics, conditions, or
pathogens. It leverages internet search to find official IDSA guidelines and
extracts key recommendations, treatment algorithms, and clinical guidance.

Key Features:
- Searches official IDSA website and trusted medical sources
- Filters results by relevance to specific infectious disease topics
- Extracts key recommendations and treatment guidance
- Provides proper citations and publication dates
- Handles multiple guideline topics (pneumonia, UTI, sepsis, etc.)
"""

import asyncio
import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Tuple, Union

from tools.base import Tool
from tools.utils import ToolExecutionError, logger

# Upper bound on the number of search queries issued per request.
_MAX_SEARCH_QUERIES = 15

# Number of top-ranked results that are turned into guideline summaries.
_MAX_GUIDELINES_RETURNED = 5

# Keywords this short are usually acronyms ('tb', 'uti', 'cap') and must match
# as whole words; as substrings they hit unrelated words such as "outbreak",
# "therapeutic" or "capacity".
_SHORT_KEYWORD_MAX_LEN = 3

# Search-term variations used to widen the query set for common topics.
_TOPIC_VARIATIONS: Dict[str, List[str]] = {
    'tuberculosis': ['tuberculosis', 'TB', 'mycobacterium tuberculosis', 'pulmonary tuberculosis'],
    'pneumonia': ['pneumonia', 'community-acquired pneumonia', 'CAP', 'hospital-acquired pneumonia'],
    'sepsis': ['sepsis', 'severe sepsis', 'septic shock', 'bloodstream infection'],
    'meningitis': ['meningitis', 'bacterial meningitis', 'CNS infection'],
    'endocarditis': ['endocarditis', 'infective endocarditis', 'valve infection'],
    'uti': ['urinary tract infection', 'UTI', 'cystitis', 'pyelonephritis'],
}

# Lower-case synonyms used when deciding whether a result is about the topic.
_TOPIC_KEYWORDS: Dict[str, List[str]] = {
    'tuberculosis': ['tuberculosis', 'tb', 'mycobacterium', 'pulmonary tb', 'latent tb', 'active tb'],
    'pneumonia': ['pneumonia', 'cap', 'hospital-acquired', 'ventilator-associated', 'lung infection'],
    'sepsis': ['sepsis', 'septic shock', 'bloodstream infection', 'bacteremia'],
    'meningitis': ['meningitis', 'cns infection', 'bacterial meningitis', 'brain infection'],
    'endocarditis': ['endocarditis', 'infective endocarditis', 'valve infection', 'heart infection'],
    'uti': ['urinary tract infection', 'uti', 'cystitis', 'pyelonephritis', 'bladder infection'],
}

# (URL fragment, score bonus) for sources treated as official, best first.
# 'academic.oup.com/cid' is Clinical Infectious Diseases, IDSA's journal.
_OFFICIAL_SOURCE_WEIGHTS: Tuple[Tuple[str, float], ...] = (
    ('idsociety.org', 20.0),
    ('idsa.org', 18.0),
    ('academic.oup.com/cid', 15.0),
)

# Terms indicating that a result talks about IDSA itself.
_IDSA_TERMS = ('idsa', 'infectious diseases society of america', 'infectious diseases society')

# Terms indicating that a result is a guideline document.
_GUIDELINE_TERMS = ('guideline', 'guidelines', 'clinical practice', 'recommendations')

# The search tool's string output marks each entry title as **Title** and its
# link as [Read more](url).
_ENTRY_TITLE_RE = re.compile(r'\*\*([^*]+)\*\*')
_READ_MORE_RE = re.compile(r'\[Read more\]\(([^)]+)\)')

_YEAR_RE = re.compile(r'\b(20\d{2})\b')

_RECOMMENDATION_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'recommend[s]?\s+([^.]+)',
        r'should\s+([^.]+)',
        r'we\s+recommend\s+([^.]+)',
        r'grade\s+[AB]\s+recommendation[:\s]+([^.]+)',
        r'strong\s+recommendation[:\s]+([^.]+)',
    )
)

_IDSA_ORG_RE = re.compile(r'infectious\s+diseases\s+society\s+of\s+america|idsa', re.IGNORECASE)
_NAMED_AUTHOR_RES = (
    re.compile(r'authored?\s+by\s+([^.]+)', re.IGNORECASE),
    re.compile(r'committee\s+([^.]+)', re.IGNORECASE),
)


def _utc_now() -> datetime:
    """Return the current timezone-aware UTC time."""
    return datetime.now(timezone.utc)


class RetrieveGuidelinesTool(Tool):
    """
    Tool for retrieving clinical practice guidelines, with focus on IDSA guidelines.

    This tool searches for current IDSA guidelines based on user queries about
    specific infectious disease conditions, pathogens, or clinical scenarios.
    """

    def __init__(self) -> None:
        """Initialize the tool's name, description and JSON argument schema."""
        super().__init__()
        self.name = "retrieve_guidelines"
        self.description = "Retrieve clinical practice guidelines for specific infectious disease topics, conditions, or pathogens, with focus on IDSA guidelines."
        self.args_schema = {
            "type": "object",
            "properties": {
                "topic": {
                    "type": "string",
                    "description": "The infectious disease topic, condition, or pathogen to search for (e.g., 'pneumonia', 'UTI', 'sepsis', 'MRSA', 'C. difficile')"
                },
                "specific_focus": {
                    "type": "string",
                    "description": "Optional: Specific aspect of the topic (e.g., 'treatment', 'diagnosis', 'prophylaxis', 'pediatric')",
                    "default": ""
                }
            },
            "required": ["topic"]
        }

    def openai_spec(self, legacy=False):
        """Return OpenAI function specification.

        ``legacy`` is accepted for interface compatibility and is not used here.
        """
        return {
            "name": self.name,
            "description": self.description,
            "parameters": self.args_schema
        }

    async def run(
        self,
        topic: str,
        specific_focus: str = ""
    ) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
        """
        Retrieve the latest IDSA guidelines for the specified topic.

        Args:
            topic (str): The infectious disease topic to search for
            specific_focus (str, optional): Specific aspect to focus on

        Returns:
            Union[List[Dict[str, Any]], Dict[str, Any]]: A dict describing the
            guidelines found, or a dict with an ``"error"`` key when the topic
            is empty or nothing relevant was found.

        Raises:
            ToolExecutionError: If the retrieval fails unexpectedly.
        """
        # An empty topic would only produce meaningless queries, so reject it.
        if not isinstance(topic, str) or not topic.strip():
            return {
                "error": "A non-empty topic is required to search for IDSA guidelines.",
                "topic": topic,
                "guidelines": []
            }
        topic = topic.strip()
        specific_focus = specific_focus.strip() if isinstance(specific_focus, str) else ""

        try:
            # Imported here rather than at module level, as in the original code.
            from tools.internet_search import InternetSearchTool

            internet_tool = InternetSearchTool()

            search_queries = self._build_search_queries(topic, specific_focus)
            guidelines_data = self._deduplicate_and_rank(
                await self._collect_guidelines(internet_tool, search_queries, topic),
                topic,
            )

            if not guidelines_data:
                # Fallback: try broader search for general treatment guidelines
                fallback_queries = [
                    f"site:idsociety.org {topic} treatment",
                    f"site:idsociety.org {topic} management",
                    f"site:idsociety.org {topic} clinical",
                    f"IDSA {topic} therapy"
                ]
                guidelines_data = self._deduplicate_and_rank(
                    await self._collect_guidelines(internet_tool, fallback_queries, topic),
                    topic,
                )

            if not guidelines_data:
                return {
                    "error": f"No IDSA guidelines found for topic: {topic}",
                    "suggestion": "Try searching for broader terms like 'infectious diseases', 'antimicrobial therapy', or specific pathogens. Note: IDSA may not have specific guidelines for all conditions.",
                    "topic": topic,
                    "guidelines": [],
                    "note": "This search is limited to official IDSA guidelines only. For tuberculosis, IDSA may refer to CDC or WHO guidelines as the primary authorities."
                }

            # Extract key information from top results
            processed_guidelines = self._extract_guideline_info(
                guidelines_data[:_MAX_GUIDELINES_RETURNED], topic
            )

            return {
                "topic": topic,
                "specific_focus": specific_focus,
                "guidelines_found": len(processed_guidelines),
                "guidelines": processed_guidelines,
                # Date the search was actually performed (UTC, YYYY-MM-DD).
                "search_timestamp": _utc_now().date().isoformat(),
                "source": "IDSA (Infectious Diseases Society of America)"
            }

        except Exception as e:
            logger.error(f"RetrieveGuidelinesTool failed: {e}", exc_info=True)
            raise ToolExecutionError(f"Failed to retrieve guidelines: {e}") from e

    async def _collect_guidelines(
        self,
        internet_tool: Any,
        queries: List[str],
        topic: str
    ) -> List[Dict]:
        """Run each query and gather the results that pass the IDSA filter.

        A failing query is logged and skipped so one bad search cannot abort
        the whole retrieval. Results that are neither a string nor a list are
        ignored.
        """
        collected: List[Dict] = []
        for query in queries:
            try:
                search_results = await internet_tool.run(query)
                if isinstance(search_results, str):
                    # Parse the formatted string response into structured data
                    candidates = self._parse_search_results(search_results)
                elif isinstance(search_results, list):
                    candidates = search_results
                else:
                    continue
                collected.extend(self._filter_idsa_guidelines(candidates, topic))
            except Exception as e:
                logger.warning(f"Search failed for query '{query}': {e}")
        return collected

    def _build_search_queries(self, topic: str, specific_focus: str) -> List[str]:
        """Build a prioritized, de-duplicated list of search queries.

        Focus-specific queries come first: for topics with several variations
        the generic queries alone exceed the query limit, and the caller's
        ``specific_focus`` would otherwise be truncated away.

        Args:
            topic: Topic to search for.
            specific_focus: Optional aspect of the topic; may be empty.

        Returns:
            At most ``_MAX_SEARCH_QUERIES`` unique queries, most specific first.
        """
        topic_key = topic.lower()
        # Get all variations of the topic
        topic_variations = _TOPIC_VARIATIONS.get(topic_key, [topic])

        queries: List[str] = []

        # Queries that honour the requested focus
        if specific_focus:
            for variation in topic_variations:
                queries.extend([
                    f"IDSA guidelines {variation} {specific_focus}",
                    f"IDSA {variation} {specific_focus} recommendations"
                ])

        # Primary IDSA-specific queries
        for variation in topic_variations:
            queries.extend([
                f"IDSA guidelines {variation}",
                f"IDSA clinical practice guidelines {variation}",
                f"Infectious Diseases Society of America {variation} guidelines",
                f"IDSA {variation} treatment guidelines",
                f"IDSA {variation} management recommendations",
                f"site:idsociety.org {variation} guidelines"
            ])

        # Add broader searches for less common conditions
        if topic_key in ('tuberculosis', 'tb'):
            queries.extend([
                "IDSA mycobacterial infections guidelines",
                "IDSA tuberculosis screening guidelines",
                "IDSA latent tuberculosis treatment",
                "site:idsociety.org tuberculosis guidelines",
                "site:idsociety.org TB guidelines",
                "site:idsociety.org mycobacterium tuberculosis"
            ])

        # Add year-specific searches for latest guidelines
        current_year = _utc_now().year
        for year in (current_year, current_year - 1, current_year - 2):
            queries.append(f"IDSA {topic} guidelines {year}")

        # dict.fromkeys drops repeated queries while keeping priority order.
        return list(dict.fromkeys(queries))[:_MAX_SEARCH_QUERIES]

    def _parse_search_results(self, search_results_str: str) -> List[Dict]:
        """Parse the formatted search results string into structured data.

        Entries are expected as ``**Title**`` followed by snippet text and a
        ``[Read more](url)`` link. Entries without a title or link are dropped.
        """
        # Splitting on a pattern with one capture group yields
        # [preamble, title1, body1, title2, body2, ...].
        parts = _ENTRY_TITLE_RE.split(search_results_str)

        results = []
        for raw_title, raw_body in zip(parts[1::2], parts[2::2]):
            title = raw_title.strip()
            body = raw_body.strip()

            link_match = _READ_MORE_RE.search(body)
            url = link_match.group(1) if link_match else ""

            # The content is everything except the link markup
            content = _READ_MORE_RE.sub('', body).strip()

            if title and url:
                results.append({
                    'title': title,
                    'url': url,
                    'content': content,
                    'snippet': content
                })
        return results

    @staticmethod
    def _lowered_fields(result: Dict) -> Tuple[str, str, str]:
        """Return the result's (url, title, content) lower-cased; missing or None becomes ''."""
        url, title, content = (
            str(result.get(key) or '').lower() for key in ('url', 'title', 'content')
        )
        return url, title, content

    @staticmethod
    def _term_in_text(term: str, text: str) -> bool:
        """Return True if ``term`` occurs in ``text`` (both already lower-case).

        Longer terms match as substrings. Terms of at most
        ``_SHORT_KEYWORD_MAX_LEN`` characters must match as a whole word
        (optionally pluralised with 's'), so 'uti' does not match
        "therapeutic" and 'tb' does not match "outbreak".
        """
        if not term:
            return False
        if len(term) > _SHORT_KEYWORD_MAX_LEN:
            return term in text
        return re.search(rf"(?<!\w){re.escape(term)}s?(?!\w)", text) is not None

    def _filter_idsa_guidelines(self, search_results: List[Dict], topic: str) -> List[Dict]:
        """Keep results that are topic-relevant and tied to IDSA.

        A result is kept when it mentions the topic AND either comes from an
        official IDSA source or mentions IDSA in its title or content. Kept
        results are returned as copies carrying a ``'relevance_score'`` key;
        the input dicts are not modified.
        """
        topic_keywords = self._get_topic_keywords(topic)
        filtered_results = []

        for result in search_results:
            if not isinstance(result, dict):
                continue
            url, title, content = self._lowered_fields(result)

            # NOTE(review): this is a substring test on the whole URL, so a URL
            # that merely contains one of these fragments also passes. Parsing
            # the hostname would be stricter — confirm which URL shapes the
            # search tool returns before tightening.
            is_official_idsa = any(domain in url for domain, _ in _OFFICIAL_SOURCE_WEIGHTS)

            is_idsa_guideline = any(term in title or term in content for term in _IDSA_TERMS)

            topic_relevant = any(
                self._term_in_text(keyword, title) or self._term_in_text(keyword, content)
                for keyword in topic_keywords
            )

            if topic_relevant and (is_official_idsa or is_idsa_guideline):
                filtered_results.append({
                    **result,
                    'relevance_score': self._calculate_relevance_score(result, topic),
                })

        return filtered_results

    def _get_topic_keywords(self, topic: str) -> List[str]:
        """Return unique lower-case keywords for topic matching.

        The list holds the topic itself, its individual words, and known
        synonyms. Duplicates are removed so that a keyword is not counted
        several times when scoring.
        """
        topic_key = topic.lower()
        candidates = [topic_key, *topic_key.split(), *_TOPIC_KEYWORDS.get(topic_key, [])]
        return [keyword for keyword in dict.fromkeys(candidates) if keyword]

    def _calculate_relevance_score(self, result: Dict, topic: str) -> float:
        """Calculate relevance score for a search result.

        The score adds bonuses for an official source, IDSA terms, guideline
        terms, topic keywords (a title hit weighs more than a content hit) and
        a mention of one of the last five calendar years.
        """
        score = 0.0
        url, title, content = self._lowered_fields(result)

        # Official IDSA sources get highest scores; only the best match counts
        for domain, bonus in _OFFICIAL_SOURCE_WEIGHTS:
            if domain in url:
                score += bonus
                break

        # IDSA-specific terms get high scores
        for term in _IDSA_TERMS:
            if term in title:
                score += 10.0
            elif term in content:
                score += 5.0

        # Guideline-specific terms
        for term in _GUIDELINE_TERMS:
            if term in title:
                score += 8.0
            elif term in content:
                score += 4.0

        # Topic relevance
        for keyword in self._get_topic_keywords(topic):
            if self._term_in_text(keyword, title):
                score += 6.0
            elif self._term_in_text(keyword, content):
                score += 2.0

        # Recency indicators: one bonus if any of the last five years appears
        current_year = _utc_now().year
        recent_years = [str(current_year - offset) for offset in range(5)]
        if any(year in title or year in content for year in recent_years):
            score += 2.0

        return score

    def _deduplicate_and_rank(self, guidelines_data: List[Dict], topic: str) -> List[Dict]:
        """Rank guidelines by relevance and drop duplicates.

        Ranking happens first so that, for a URL returned by several queries,
        the best-scoring copy is kept. Results without a URL are keyed by
        title so they are not all collapsed into one. ``topic`` is unused and
        kept for interface compatibility.
        """
        ranked = sorted(
            guidelines_data,
            key=lambda guideline: guideline.get('relevance_score', 0),
            reverse=True,
        )

        seen_keys = set()
        unique_guidelines = []
        for guideline in ranked:
            key = guideline.get('url') or guideline.get('title') or ''
            if key in seen_keys:
                continue
            seen_keys.add(key)
            unique_guidelines.append(guideline)

        return unique_guidelines

    def _extract_guideline_info(self, guidelines_data: List[Dict], topic: str) -> List[Dict]:
        """Extract key information from guideline search results.

        Each result becomes a dict with title, URL, publication year, authors,
        key recommendations, relevance score and summary. A result that fails
        to process is logged and skipped.
        """
        processed_guidelines = []

        for guideline in guidelines_data:
            try:
                title = str(guideline.get('title') or '')
                url = str(guideline.get('url') or '')
                content = str(guideline.get('content') or '')

                processed_guidelines.append({
                    'title': title,
                    'url': url,
                    'publication_year': self._extract_publication_year(title, content),
                    'authors': self._extract_authors(content),
                    'key_recommendations': self._extract_recommendations(content),
                    'relevance_score': guideline.get('relevance_score', 0),
                    'summary': self._generate_summary(content, topic)
                })
            except Exception as e:
                logger.warning(f"Failed to process guideline: {e}")
                continue

        return processed_guidelines

    def _extract_publication_year(self, title: str, content: str) -> str:
        """Extract publication year from title or content.

        The title is checked before the content. Within the first text that
        contains a plausible year, the most recent one is returned. Years
        after the current year are ignored because they cannot be publication
        dates. Returns ``"Unknown"`` when no year is found.
        """
        current_year = _utc_now().year

        for text in (title, content):
            years = [int(match) for match in _YEAR_RE.findall(text)]
            plausible_years = [year for year in years if year <= current_year]
            if plausible_years:
                return str(max(plausible_years))

        return "Unknown"

    def _extract_recommendations(self, content: str) -> List[str]:
        """Extract up to five distinct recommendation snippets from content.

        Up to three matches per pattern are collected, trimmed, limited to a
        reasonable length (more than 20 and fewer than 200 characters) and
        de-duplicated. The patterns overlap ("recommend ..." also matches
        inside "we recommend ..."), so the same text is often captured twice.
        """
        candidates: List[str] = []
        for pattern in _RECOMMENDATION_RES:
            candidates.extend(pattern.findall(content)[:3])  # Limit to top 3 per pattern

        seen = set()
        cleaned_recommendations = []
        for candidate in candidates:
            cleaned = candidate.strip()
            if not 20 < len(cleaned) < 200:
                continue
            key = cleaned.casefold()
            if key in seen:
                continue
            seen.add(key)
            cleaned_recommendations.append(cleaned)

        return cleaned_recommendations[:5]  # Return top 5 recommendations

    def _extract_authors(self, content: str) -> str:
        """Extract authors or organization from content.

        Any mention of IDSA wins. Otherwise the text following "authored by"
        or "committee" is returned, and ``"IDSA"`` is the default.
        """
        if _IDSA_ORG_RE.search(content):
            return "Infectious Diseases Society of America (IDSA)"

        for pattern in _NAMED_AUTHOR_RES:
            match = pattern.search(content)
            if match:
                return match.group(1).strip()

        return "IDSA"

    def _generate_summary(self, content: str, topic: str) -> str:
        """Generate a brief summary of the guideline.

        Uses up to two of the first ten sentences that mention the topic. If
        none do, the first non-empty sentence is used, and a generic
        description is returned when the content has no text at all.
        """
        sentences = content.split('.')
        topic_lower = topic.lower()

        relevant_sentences = []
        for sentence in sentences[:10]:  # Check first 10 sentences
            if topic_lower in sentence.lower():
                relevant_sentences.append(sentence.strip())
                if len(relevant_sentences) >= 2:
                    break

        if relevant_sentences:
            return '. '.join(relevant_sentences) + '.'

        # str.split always returns at least one (possibly empty) element, so
        # look for real text instead of testing the list for emptiness.
        first_sentence = next((s.strip() for s in sentences if s.strip()), "")
        if first_sentence:
            return first_sentence + '.'
        return "IDSA clinical practice guideline."