"""
retrieve_guidelines.py
----------------------

Tool for retrieving clinical practice guidelines, with focus on IDSA (Infectious Diseases Society of America) guidelines.

This tool searches for and retrieves the most current clinical guidelines based on user queries about specific 
infectious disease topics, conditions, or pathogens. It leverages internet search to find official IDSA 
guidelines and extracts key recommendations, treatment algorithms, and clinical guidance.

Key Features:
- Searches official IDSA website and trusted medical sources
- Filters results by relevance to specific infectious disease topics
- Extracts key recommendations and treatment guidance
- Provides proper citations and publication dates
- Handles multiple guideline topics (pneumonia, UTI, sepsis, etc.)
"""

import asyncio
import re
from typing import Any, Dict, List, Union
from tools.base import Tool
from tools.utils import ToolExecutionError, logger

class RetrieveGuidelinesTool(Tool):
    """
    Tool for retrieving clinical practice guidelines, with focus on IDSA guidelines.
    
    This tool searches for current IDSA guidelines based on user queries about specific
    infectious disease conditions, pathogens, or clinical scenarios.
    """
    
    def __init__(self) -> None:
        """Set the tool's name, description, and JSON argument schema."""
        super().__init__()

        # JSON-schema fragments for the two arguments accepted by run().
        topic_property = {
            "type": "string",
            "description": "The infectious disease topic, condition, or pathogen to search for (e.g., 'pneumonia', 'UTI', 'sepsis', 'MRSA', 'C. difficile')"
        }
        focus_property = {
            "type": "string",
            "description": "Optional: Specific aspect of the topic (e.g., 'treatment', 'diagnosis', 'prophylaxis', 'pediatric')",
            "default": ""
        }

        self.name = "retrieve_guidelines"
        self.description = "Retrieve clinical practice guidelines for specific infectious disease topics, conditions, or pathogens, with focus on IDSA guidelines."
        self.args_schema = {
            "type": "object",
            "properties": {
                "topic": topic_property,
                "specific_focus": focus_property
            },
            # Only the topic is mandatory; specific_focus defaults to "".
            "required": ["topic"]
        }

    def openai_spec(self, legacy=False):
        """Return OpenAI function specification."""
        return {
            "name": self.name,
            "description": self.description,
            "parameters": self.args_schema
        }

    async def run(
        self,
        topic: str,
        specific_focus: str = ""
    ) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
        """
        Retrieve IDSA guideline search results for the specified topic.

        Runs the queries produced by ``_build_search_queries`` through the
        internet search tool, keeps the results accepted by
        ``_filter_idsa_guidelines``, and summarizes the five highest-ranked
        ones. If the primary queries yield nothing, a small set of broader
        fallback queries is tried before giving up.

        Args:
            topic (str): The infectious disease topic to search for
            specific_focus (str, optional): Specific aspect to focus on

        Returns:
            Dict[str, Any]: On success, a dict with the keys "topic",
            "specific_focus", "guidelines_found", "guidelines",
            "search_timestamp" (today's UTC date, ISO format) and "source".
            When nothing is found, a dict with "error", "suggestion",
            "topic", an empty "guidelines" list and a "note".

        Raises:
            ToolExecutionError: If anything outside the per-query search
                loop fails (individual query failures are only logged).
        """
        # Local stdlib import, mirroring the local tool import below.
        from datetime import datetime, timezone

        try:
            # Import internet search tool
            from tools.internet_search import InternetSearchTool
            internet_tool = InternetSearchTool()

            # Primary pass: IDSA-specific queries for the topic.
            search_queries = self._build_search_queries(topic, specific_focus)
            guidelines_data = await self._collect_guidelines(internet_tool, search_queries, topic)
            guidelines_data = self._deduplicate_and_rank(guidelines_data, topic)

            if not guidelines_data:
                # Fallback: try broader search for general treatment guidelines
                fallback_queries = [
                    f"site:idsociety.org {topic} treatment",
                    f"site:idsociety.org {topic} management",
                    f"site:idsociety.org {topic} clinical",
                    f"IDSA {topic} therapy"
                ]
                guidelines_data = await self._collect_guidelines(internet_tool, fallback_queries, topic)
                guidelines_data = self._deduplicate_and_rank(guidelines_data, topic)

            if not guidelines_data:
                return {
                    "error": f"No IDSA guidelines found for topic: {topic}",
                    "suggestion": "Try searching for broader terms like 'infectious diseases', 'antimicrobial therapy', or specific pathogens. Note: IDSA may not have specific guidelines for all conditions.",
                    "topic": topic,
                    "guidelines": [],
                    "note": "This search is limited to official IDSA guidelines only. For tuberculosis, IDSA may refer to CDC or WHO guidelines as the primary authorities."
                }

            # Extract key information from top results
            processed_guidelines = self._extract_guideline_info(guidelines_data[:5], topic)

            return {
                "topic": topic,
                "specific_focus": specific_focus,
                "guidelines_found": len(processed_guidelines),
                "guidelines": processed_guidelines,
                # Actual date of the search rather than a frozen literal.
                "search_timestamp": datetime.now(timezone.utc).date().isoformat(),
                "source": "IDSA (Infectious Diseases Society of America)"
            }

        except Exception as e:
            logger.error(f"RetrieveGuidelinesTool failed: {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise ToolExecutionError(f"Failed to retrieve guidelines: {e}") from e

    async def _collect_guidelines(
        self,
        internet_tool: Any,
        queries: List[str],
        topic: str
    ) -> List[Dict]:
        """
        Run each query and gather the results that pass the IDSA filter.

        A failing query is logged and skipped so that one bad search does
        not abort the whole retrieval.

        Args:
            internet_tool: Object exposing an awaitable ``run(query)``.
            queries: Search queries to execute, in order.
            topic: Topic used for relevance filtering.

        Returns:
            List[Dict]: Filtered results from all queries, not yet
            deduplicated or ranked.
        """
        collected: List[Dict] = []

        for query in queries:
            try:
                search_results = await internet_tool.run(query)

                if isinstance(search_results, str):
                    # Formatted text response: parse into structured entries.
                    search_results = self._parse_search_results(search_results)
                elif not isinstance(search_results, list):
                    # Any other response type carries nothing usable.
                    continue

                collected.extend(self._filter_idsa_guidelines(search_results, topic))
            except Exception as e:
                logger.warning(f"Search failed for query '{query}': {e}")

        return collected

    def _build_search_queries(self, topic: str, specific_focus: str) -> List[str]:
        """Build comprehensive search queries for IDSA guidelines."""
        queries = []
        
        # Map common terms to more specific medical terms
        topic_mapping = {
            'tuberculosis': ['tuberculosis', 'TB', 'mycobacterium tuberculosis', 'pulmonary tuberculosis'],
            'pneumonia': ['pneumonia', 'community-acquired pneumonia', 'CAP', 'hospital-acquired pneumonia'],
            'sepsis': ['sepsis', 'severe sepsis', 'septic shock', 'bloodstream infection'],
            'meningitis': ['meningitis', 'bacterial meningitis', 'CNS infection'],
            'endocarditis': ['endocarditis', 'infective endocarditis', 'valve infection'],
            'uti': ['urinary tract infection', 'UTI', 'cystitis', 'pyelonephritis']
        }
        
        # Get all variations of the topic
        topic_variations = topic_mapping.get(topic.lower(), [topic])
        
        # Primary IDSA-specific queries
        for variation in topic_variations:
            queries.extend([
                f"IDSA guidelines {variation}",
                f"IDSA clinical practice guidelines {variation}",
                f"Infectious Diseases Society of America {variation} guidelines",
                f"IDSA {variation} treatment guidelines",
                f"IDSA {variation} management recommendations",
                f"site:idsociety.org {variation} guidelines"
            ])
        
        # Add specific focus if provided
        if specific_focus:
            for variation in topic_variations:
                queries.extend([
                    f"IDSA guidelines {variation} {specific_focus}",
                    f"IDSA {variation} {specific_focus} recommendations"
                ])
        
        # Add broader searches for less common conditions
        if topic.lower() in ['tuberculosis', 'tb']:
            queries.extend([
                "IDSA mycobacterial infections guidelines",
                "IDSA tuberculosis screening guidelines", 
                "IDSA latent tuberculosis treatment",
                "site:idsociety.org tuberculosis guidelines",
                "site:idsociety.org TB guidelines",
                "site:idsociety.org mycobacterium tuberculosis"
            ])
        
        # Add year-specific searches for latest guidelines
        current_year = 2025
        for year in [current_year, current_year-1, current_year-2]:
            queries.append(f"IDSA {topic} guidelines {year}")
        
        return queries[:15]  # Limit to 15 most relevant queries

    def _parse_search_results(self, search_results_str: str) -> List[Dict]:
        """Parse the formatted search results string into structured data."""
        results = []
        
        # Split by entries (each entry starts with **)
        entries = re.split(r'\*\*([^*]+)\*\*', search_results_str)
        
        for i in range(1, len(entries), 2):  # Skip first empty entry, then take every other
            if i + 1 < len(entries):
                title = entries[i].strip()
                content_and_link = entries[i + 1].strip()
                
                # Extract the link
                link_match = re.search(r'\[Read more\]\(([^)]+)\)', content_and_link)
                url = link_match.group(1) if link_match else ""
                
                # Extract the content (everything before the link)
                content = re.sub(r'\[Read more\]\([^)]+\)', '', content_and_link).strip()
                
                if title and url:
                    results.append({
                        'title': title,
                        'url': url,
                        'content': content,
                        'snippet': content
                    })
        
        return results

    def _filter_idsa_guidelines(self, search_results: List[Dict], topic: str) -> List[Dict]:
        """Filter search results to focus ONLY on official IDSA guidelines."""
        filtered_results = []
        
        for result in search_results:
            url = result.get('url', '').lower()
            title = result.get('title', '').lower()
            content = result.get('content', '').lower()
            
            # Check if it's from official IDSA sources ONLY
            is_official_idsa = any(domain in url for domain in [
                'idsociety.org',
                'idsa.org',
                'academic.oup.com/cid'  # Clinical Infectious Diseases journal (IDSA's official journal)
            ])
            
            # Check if it contains IDSA-specific guideline indicators
            is_idsa_guideline = any(indicator in title or indicator in content for indicator in [
                'idsa', 'infectious diseases society of america', 'infectious diseases society',
                'idsa guideline', 'idsa guidelines', 'idsa clinical practice'
            ])
            
            # Enhanced topic relevance check
            topic_keywords = self._get_topic_keywords(topic)
            topic_relevant = any(keyword in title or keyword in content for keyword in topic_keywords)
            
            # Only include if it's from official IDSA source AND contains guideline indicators AND is topic relevant
            if topic_relevant and (is_official_idsa or is_idsa_guideline):
                result['relevance_score'] = self._calculate_relevance_score(result, topic)
                filtered_results.append(result)
        
        return filtered_results

    def _get_topic_keywords(self, topic: str) -> List[str]:
        """Get relevant keywords for topic matching."""
        base_keywords = [topic.lower(), *topic.lower().split()]
        
        # Add specific synonyms and related terms
        keyword_mapping = {
            'tuberculosis': ['tuberculosis', 'tb', 'mycobacterium', 'pulmonary tb', 'latent tb', 'active tb'],
            'pneumonia': ['pneumonia', 'cap', 'hospital-acquired', 'ventilator-associated', 'lung infection'],
            'sepsis': ['sepsis', 'septic shock', 'bloodstream infection', 'bacteremia'],
            'meningitis': ['meningitis', 'cns infection', 'bacterial meningitis', 'brain infection'],
            'endocarditis': ['endocarditis', 'infective endocarditis', 'valve infection', 'heart infection'],
            'uti': ['urinary tract infection', 'uti', 'cystitis', 'pyelonephritis', 'bladder infection']
        }
        
        if topic.lower() in keyword_mapping:
            base_keywords.extend(keyword_mapping[topic.lower()])
        
        return base_keywords

    def _calculate_relevance_score(self, result: Dict, topic: str) -> float:
        """Calculate relevance score for a search result."""
        score = 0.0
        
        url = result.get('url', '').lower()
        title = result.get('title', '').lower()
        content = result.get('content', '').lower()
        
        # Official IDSA sources get highest scores
        if 'idsociety.org' in url:
            score += 20.0
        elif 'idsa.org' in url:
            score += 18.0
        elif 'academic.oup.com/cid' in url:
            score += 15.0
        
        # IDSA-specific terms get high scores
        idsa_terms = ['idsa', 'infectious diseases society of america', 'infectious diseases society']
        for term in idsa_terms:
            if term in title:
                score += 10.0
            elif term in content:
                score += 5.0
        
        # Guideline-specific terms
        guideline_terms = ['guideline', 'guidelines', 'clinical practice', 'recommendations']
        for term in guideline_terms:
            if term in title:
                score += 8.0
            elif term in content:
                score += 4.0
        
        # Topic relevance
        topic_keywords = self._get_topic_keywords(topic)
        for keyword in topic_keywords:
            if keyword in title:
                score += 6.0
            elif keyword in content:
                score += 2.0
        
        # Recency indicators
        recent_years = ['2025', '2024', '2023', '2022', '2021']
        for year in recent_years:
            if year in title or year in content:
                score += 2.0
                break
        
        return score

    def _deduplicate_and_rank(self, guidelines_data: List[Dict], topic: str) -> List[Dict]:
        """Remove duplicates and rank guidelines by relevance."""
        # Remove duplicates based on URL
        seen_urls = set()
        unique_guidelines = []
        
        for guideline in guidelines_data:
            url = guideline.get('url', '')
            if url not in seen_urls:
                seen_urls.add(url)
                unique_guidelines.append(guideline)
        
        # Sort by relevance score
        unique_guidelines.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
        
        return unique_guidelines

    def _extract_guideline_info(self, guidelines_data: List[Dict], topic: str) -> List[Dict]:
        """Extract key information from guideline search results."""
        processed_guidelines = []
        
        for guideline in guidelines_data:
            try:
                # Extract key information
                title = guideline.get('title', '')
                url = guideline.get('url', '')
                content = guideline.get('content', '')
                
                # Extract publication year
                pub_year = self._extract_publication_year(title, content)
                
                # Extract key recommendations
                recommendations = self._extract_recommendations(content)
                
                # Extract authors/organization
                authors = self._extract_authors(content)
                
                processed_guideline = {
                    'title': title,
                    'url': url,
                    'publication_year': pub_year,
                    'authors': authors,
                    'key_recommendations': recommendations,
                    'relevance_score': guideline.get('relevance_score', 0),
                    'summary': self._generate_summary(content, topic)
                }
                
                processed_guidelines.append(processed_guideline)
                
            except Exception as e:
                logger.warning(f"Failed to process guideline: {e}")
                continue
        
        return processed_guidelines

    def _extract_publication_year(self, title: str, content: str) -> str:
        """Extract publication year from title or content."""
        # Look for years in title first
        year_pattern = r'\b(20\d{2})\b'
        
        for text in [title, content]:
            matches = re.findall(year_pattern, text)
            if matches:
                # Return the most recent year found
                return max(matches)
        
        return "Unknown"

    def _extract_recommendations(self, content: str) -> List[str]:
        """Extract key recommendations from guideline content."""
        recommendations = []
        
        # Look for common recommendation patterns
        recommendation_patterns = [
            r'recommend[s]?\s+([^.]+)',
            r'should\s+([^.]+)',
            r'we\s+recommend\s+([^.]+)',
            r'grade\s+[AB]\s+recommendation[:\s]+([^.]+)',
            r'strong\s+recommendation[:\s]+([^.]+)'
        ]
        
        for pattern in recommendation_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            recommendations.extend(matches[:3])  # Limit to top 3 per pattern
        
        # Clean up recommendations
        cleaned_recommendations = []
        for rec in recommendations:
            cleaned = rec.strip()
            if len(cleaned) > 20 and len(cleaned) < 200:  # Reasonable length
                cleaned_recommendations.append(cleaned)
        
        return cleaned_recommendations[:5]  # Return top 5 recommendations

    def _extract_authors(self, content: str) -> str:
        """Extract authors or organization from content."""
        # Look for IDSA or author patterns
        author_patterns = [
            r'infectious\s+diseases\s+society\s+of\s+america',
            r'idsa',
            r'authored?\s+by\s+([^.]+)',
            r'committee\s+([^.]+)'
        ]
        
        for pattern in author_patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                if 'idsa' in pattern or 'infectious' in pattern:
                    return "Infectious Diseases Society of America (IDSA)"
                else:
                    return match.group(1).strip()
        
        return "IDSA"

    def _generate_summary(self, content: str, topic: str) -> str:
        """Generate a brief summary of the guideline."""
        # Extract first few sentences that mention the topic
        sentences = content.split('.')
        relevant_sentences = []
        
        for sentence in sentences[:10]:  # Check first 10 sentences
            if topic.lower() in sentence.lower():
                relevant_sentences.append(sentence.strip())
                if len(relevant_sentences) >= 2:
                    break
        
        if relevant_sentences:
            return '. '.join(relevant_sentences) + '.'
        else:
            # Return first sentence if no topic-specific content found
            return sentences[0].strip() + '.' if sentences else "IDSA clinical practice guideline."