import os
import tempfile
from pathlib import Path

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import CLIPProcessor, CLIPModel

try:
    from transformers import ClapModel, ClapProcessor
    CLAP_AVAILABLE = True
    CLAP_METHOD = "transformers"
except ImportError:
    CLAP_AVAILABLE = False
    CLAP_METHOD = None

# Check MERT availability
try:
    from transformers import AutoModel, Wav2Vec2FeatureExtractor
    MERT_AVAILABLE = True
    MERT_METHOD = "transformers"
except ImportError:
    MERT_AVAILABLE = False
    MERT_METHOD = None

import torch
import torchaudio
from PIL import Image
import requests
import numpy as np
import io
import logging
import librosa
import soundfile as sf
import scipy.signal

# Set environment to disable librosa caching
os.environ['LIBROSA_CACHE_DIR'] = '/tmp'
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
# Disable librosa caching completely to avoid the __o_fold error
os.environ['LIBROSA_CACHE_LEVEL'] = '0'
os.environ['LIBROSA_CACHE_COMPRESS'] = '0'

# Check pitch-aware model availability
try:
    # Try to use a simpler pitch-aware approach with librosa
    import librosa
    PITCH_AWARE_AVAILABLE = True
    PITCH_METHOD = "librosa_chroma"
except ImportError:
    PITCH_AWARE_AVAILABLE = False
    PITCH_METHOD = None

# Fusion configuration
FUSION_MODE = os.environ.get('FUSION_MODE', 'VECTOR_CONCAT')  # VECTOR_CONCAT or SCORE_FUSION
FUSION_ALPHA = float(os.environ.get('FUSION_ALPHA', '0.6'))  # Alpha for score fusion
ENABLE_PITCH_FUSION = os.environ.get('ENABLE_PITCH_FUSION', 'false').lower() == 'true' and PITCH_AWARE_AVAILABLE
ENABLE_MERT_FUSION = os.environ.get('ENABLE_MERT_FUSION', 'false').lower() == 'true' and MERT_AVAILABLE

# Audio processing limits
MAX_AUDIO_DURATION_SEC = int(os.environ.get('MAX_AUDIO_DURATION_SEC', '600'))  # 10 minutes default

# Set up cache directories
cache_dir = os.environ.get('TRANSFORMERS_CACHE', '/code/cache')
os.makedirs(cache_dir, exist_ok=True)
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['HF_HOME'] = cache_dir
os.environ['TORCH_HOME'] = cache_dir

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="CLIP Service", version="1.0.0")


def sanitize_for_json(embedding):
    """Sanitize embedding for JSON serialization"""
    if isinstance(embedding, np.ndarray):
        # Ensure finite values and convert to list
        embedding = np.nan_to_num(embedding, nan=0.0, posinf=0.0, neginf=0.0)
        return embedding.tolist()
    elif isinstance(embedding, list):
        # Check each element for finite values
        return [float(x) if np.isfinite(x) else 0.0 for x in embedding]
    else:
        return embedding


# Log CLAP, MERT, and pitch-aware availability after logger is initialized
logger.info(f"CLAP availability: {CLAP_AVAILABLE}, method: {CLAP_METHOD}")
logger.info(f"MERT availability: {MERT_AVAILABLE}, method: {MERT_METHOD}")
logger.info(f"Pitch-aware availability: {PITCH_AWARE_AVAILABLE}, method: {PITCH_METHOD}")
logger.info(f"Pitch fusion enabled: {ENABLE_PITCH_FUSION}")
logger.info(f"MERT fusion enabled: {ENABLE_MERT_FUSION}")
logger.info(f"Fusion mode: {FUSION_MODE}")
if FUSION_MODE == 'SCORE_FUSION':
    logger.info(f"Fusion alpha: {FUSION_ALPHA}")
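
# Hedged sketch (not called by this service, provided for illustration only): in
# SCORE_FUSION mode the audio endpoint returns per-model embeddings, and a
# downstream consumer is expected to blend the per-model cosine similarities
# itself. The helper below shows one plausible weighting using FUSION_ALPHA; the
# exact scheme a caller uses is an assumption, not something this module defines.
def example_score_fusion_similarity(clap_similarity: float, aux_similarity: float, alpha: float = FUSION_ALPHA) -> float:
    """Blend a CLAP cosine similarity with an auxiliary (MERT or pitch) cosine similarity."""
    return alpha * clap_similarity + (1.0 - alpha) * aux_similarity
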
class CLIPService:
    def __init__(self):
        logger.info("Loading CLIP model...")
        self.clap_model = None
        self.clap_processor = None
        self.mert_model = None
        self.mert_processor = None
        # Simple in-memory cache for pitch features keyed by audio hash.
        # Using a dict avoids the "unhashable type: numpy.ndarray" error we hit with functools.lru_cache.
        self.pitch_feature_cache: dict[int, np.ndarray] = {}

        try:
            # Use CPU for Hugging Face free tier
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {self.device}")

            # Load CLIP model with explicit cache directory
            logger.info("Loading CLIP model from HuggingFace...")
            self.clip_model = CLIPModel.from_pretrained(
                "openai/clip-vit-large-patch14",
                cache_dir=cache_dir,
                local_files_only=False
            ).to(self.device)

            logger.info("Loading CLIP processor...")
            use_fast = os.environ.get('USE_FAST_PROCESSOR', 'true').lower() == 'true'
            logger.info(f"Using fast processor: {use_fast}")
            self.clip_processor = CLIPProcessor.from_pretrained(
                "openai/clip-vit-large-patch14",
                cache_dir=cache_dir,
                local_files_only=False,
                use_fast=use_fast
            )
            logger.info(f"CLIP model loaded successfully on {self.device}")
        except Exception as e:
            logger.error(f"Failed to load CLIP model: {str(e)}")
            logger.error(f"Error type: {type(e).__name__}")
            raise RuntimeError(f"CLIP model loading failed: {str(e)}")

    def _load_clap_model(self):
        """Load CLAP model on demand"""
        if not CLAP_AVAILABLE:
            raise RuntimeError("CLAP model not available - transformers version may not support CLAP")

        if self.clap_model is None:
            logger.info(f"Loading CLAP model on demand using {CLAP_METHOD} method...")
            try:
                if CLAP_METHOD == "transformers":
                    logger.info("Loading CLAP model from HuggingFace...")
                    self.clap_model = ClapModel.from_pretrained(
                        "laion/clap-htsat-unfused",
                        cache_dir=cache_dir,
                        local_files_only=False
                    ).to(self.device)

                    logger.info("Loading CLAP processor...")
                    use_fast = os.environ.get('USE_FAST_PROCESSOR', 'true').lower() == 'true'
                    self.clap_processor = ClapProcessor.from_pretrained(
                        "laion/clap-htsat-unfused",
                        cache_dir=cache_dir,
                        local_files_only=False,
                        use_fast=use_fast
                    )
                logger.info(f"CLAP model loaded successfully on {self.device} using {CLAP_METHOD}")
            except Exception as e:
                logger.error(f"Failed to load CLAP model: {str(e)}")
                logger.error(f"Error type: {type(e).__name__}")
                raise RuntimeError(f"CLAP model loading failed: {str(e)}")

    def _load_mert_model(self):
        """Load MERT model on demand"""
        if not MERT_AVAILABLE:
            raise RuntimeError("MERT model not available - transformers version may not support MERT")

        if self.mert_model is None:
            logger.info(f"Loading MERT model on demand using {MERT_METHOD} method...")
            try:
                logger.info("Loading MERT model from HuggingFace...")
                self.mert_model = AutoModel.from_pretrained(
                    "m-a-p/MERT-v1-95M",
                    trust_remote_code=True,
                    cache_dir=cache_dir,
                    local_files_only=False
                ).to(self.device)

                # Guard against missing encoder (stale HF cache issue)
                if not hasattr(self.mert_model, "encoder"):
                    raise RuntimeError("MERT weights not loaded - clear HF cache and retry")

                logger.info("Loading MERT processor...")
                use_fast = os.environ.get('USE_FAST_PROCESSOR', 'true').lower() == 'true'
                self.mert_processor = Wav2Vec2FeatureExtractor.from_pretrained(
                    "m-a-p/MERT-v1-95M",
                    cache_dir=cache_dir,
                    local_files_only=False,
                    use_fast=use_fast
                )
                logger.info(f"MERT model loaded successfully on {self.device} using {MERT_METHOD}")
            except Exception as e:
                logger.error(f"Failed to load MERT model: {str(e)}")
                logger.error(f"Error type: {type(e).__name__}")
                raise RuntimeError(f"MERT model loading failed: {str(e)}")

    def extract_pitch_features(self, audio_array: np.ndarray, sample_rate: int) -> np.ndarray:
        """Extract pitch-aware features using numpy/scipy (avoiding all librosa caching issues) with a lightweight dict cache"""
        cache_key = hash(audio_array.tobytes())

        # Fast path: return cached result if we've already seen this audio chunk
        if cache_key in self.pitch_feature_cache:
            return self.pitch_feature_cache[cache_key]

        # Slow path: compute fresh features and store them
        features = self._extract_pitch_features_impl(audio_array, sample_rate)
        self.pitch_feature_cache[cache_key] = features
        return features
    def _extract_pitch_features_impl(self, audio_array: np.ndarray, sample_rate: int) -> np.ndarray:
        """Implementation of pitch feature extraction (results are cached by the caller)"""
        try:
            # Use pure numpy/scipy implementations to avoid all librosa caching issues
            features = []

            # Extract basic audio features using numpy/scipy only
            try:
                # Basic spectral features using numpy FFT
                spectral_features = self._extract_spectral_features_numpy(audio_array, sample_rate)
                features.extend(spectral_features)
                logger.info("✓ Spectral features extracted (numpy)")
            except Exception as e:
                logger.warning(f"Spectral feature extraction failed: {e}, using zeros")
                features.extend(np.zeros(20))  # 20 spectral features

            try:
                # Basic temporal features
                temporal_features = self._extract_temporal_features_numpy(audio_array, sample_rate)
                features.extend(temporal_features)
                logger.info("✓ Temporal features extracted (numpy)")
            except Exception as e:
                logger.warning(f"Temporal feature extraction failed: {e}, using zeros")
                features.extend(np.zeros(15))  # 15 temporal features

            try:
                # Basic frequency domain features
                frequency_features = self._extract_frequency_features_numpy(audio_array, sample_rate)
                features.extend(frequency_features)
                logger.info("✓ Frequency features extracted (numpy)")
            except Exception as e:
                logger.warning(f"Frequency feature extraction failed: {e}, using zeros")
                features.extend(np.zeros(25))  # 25 frequency features

            # Simple tempo estimation (fallback)
            try:
                tempo = self._estimate_tempo_numpy(audio_array, sample_rate)
                features.append(tempo)
                logger.info(f"✓ Tempo estimated: {tempo:.1f} BPM")
            except Exception as e:
                logger.warning(f"Tempo estimation failed: {e}, using default")
                features.append(120.0)  # Default BPM

            # Convert to numpy array and check for NaN/inf values
            pitch_features = np.array(features, dtype=np.float32)

            # Replace any NaN or inf values with 0
            pitch_features = np.nan_to_num(pitch_features, nan=0.0, posinf=0.0, neginf=0.0)

            # Ensure we have the expected 85 dimensions
            if len(pitch_features) < 85:
                # Pad with zeros if needed
                padding = np.zeros(85 - len(pitch_features), dtype=np.float32)
                pitch_features = np.concatenate([pitch_features, padding])
            elif len(pitch_features) > 85:
                # Truncate if too long
                pitch_features = pitch_features[:85]

            # L2 normalize
            norm = np.linalg.norm(pitch_features)
            if norm > 0:
                pitch_features = pitch_features / norm
            else:
                # If norm is 0, create a small non-zero vector
                pitch_features = np.ones(85, dtype=np.float32) * 0.001
                pitch_features = pitch_features / np.linalg.norm(pitch_features)

            # Final check for finite values
            pitch_features = np.nan_to_num(pitch_features, nan=0.0, posinf=0.0, neginf=0.0)

            # The caller (extract_pitch_features) stores this result in the dict cache
            logger.info(f"Extracted pitch features: {len(pitch_features)} dimensions")
            return pitch_features

        except Exception as e:
            logger.error(f"Error extracting pitch features: {str(e)}")
            # Return normalized zero vector if extraction fails
            zero_features = np.ones(85, dtype=np.float32) * 0.001
            return zero_features / np.linalg.norm(zero_features)

    def _extract_spectral_features_numpy(self, audio_array: np.ndarray, sample_rate: int) -> list:
        """Extract spectral features using only numpy (no librosa)"""
        # Compute FFT
        fft = np.fft.fft(audio_array)
        magnitude = np.abs(fft)
        freqs = np.fft.fftfreq(len(audio_array), 1 / sample_rate)

        # Only use positive frequencies
        pos_freqs = freqs[:len(freqs) // 2]
        pos_magnitude = magnitude[:len(magnitude) // 2]

        # Spectral centroid
        spectral_centroid = np.sum(pos_freqs * pos_magnitude) / np.sum(pos_magnitude)

        # Spectral rolloff (95% of energy)
        cumsum = np.cumsum(pos_magnitude)
        rolloff_idx = np.where(cumsum >= 0.95 * cumsum[-1])[0]
        spectral_rolloff = pos_freqs[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0

        # Spectral spread
        spectral_spread = np.sqrt(np.sum(((pos_freqs - spectral_centroid) ** 2) * pos_magnitude) / np.sum(pos_magnitude))

        # Zero crossing rate
        zero_crossings = np.where(np.diff(np.sign(audio_array)))[0]
        zcr = len(zero_crossings) / len(audio_array)

        # RMS energy
        rms = np.sqrt(np.mean(audio_array ** 2))

        # Basic spectral features
        features = [
            spectral_centroid / sample_rate,  # Normalize
            spectral_rolloff / sample_rate,   # Normalize
            spectral_spread / sample_rate,    # Normalize
            zcr,
            rms,
            np.max(pos_magnitude),
            np.mean(pos_magnitude),
            np.std(pos_magnitude),
            np.sum(pos_magnitude),
            np.var(pos_magnitude)
        ]

        # Add frequency band energies (10 bands)
        band_energies = []
        n_bands = 10
        for i in range(n_bands):
            start_idx = i * len(pos_magnitude) // n_bands
            end_idx = (i + 1) * len(pos_magnitude) // n_bands
            band_energy = np.sum(pos_magnitude[start_idx:end_idx])
            band_energies.append(band_energy)

        features.extend(band_energies)
        return features

    def _extract_temporal_features_numpy(self, audio_array: np.ndarray, sample_rate: int) -> list:
        """Extract temporal features using only numpy"""
        # Basic statistics
        mean_val = np.mean(audio_array)
        std_val = np.std(audio_array)
        skew_val = np.mean(((audio_array - mean_val) / std_val) ** 3)
        kurtosis_val = np.mean(((audio_array - mean_val) / std_val) ** 4)

        # Energy-based features
        energy = np.sum(audio_array ** 2)
        power = energy / len(audio_array)

        # Envelope features
        envelope = np.abs(audio_array)
        envelope_mean = np.mean(envelope)
        envelope_std = np.std(envelope)

        # Attack/decay characteristics
        envelope_diff = np.diff(envelope)
        attack_time = np.mean(envelope_diff[envelope_diff > 0])
        decay_time = np.mean(-envelope_diff[envelope_diff < 0])

        # Peak characteristics
        peaks = np.where(np.diff(np.sign(np.diff(audio_array))) < 0)[0]
        peak_density = len(peaks) / len(audio_array)

        # Dynamics
        dynamic_range = np.max(envelope) - np.min(envelope)

        features = [
            mean_val,
            std_val,
            skew_val,
            kurtosis_val,
            energy,
            power,
            envelope_mean,
            envelope_std,
            attack_time if np.isfinite(attack_time) else 0.0,
            decay_time if np.isfinite(decay_time) else 0.0,
            peak_density,
            dynamic_range,
            np.percentile(envelope, 25),
            np.percentile(envelope, 75),
            np.median(envelope)
        ]

        return features

    def _extract_frequency_features_numpy(self, audio_array: np.ndarray, sample_rate: int) -> list:
        """Extract frequency domain features using only numpy"""
        # Short-time analysis
        hop_length = 512
        frame_length = 2048
        features = []

        # Process in overlapping windows
        n_frames = (len(audio_array) - frame_length) // hop_length + 1
        frame_features = []
        for i in range(0, min(n_frames, 50)):  # Limit to 50 frames for performance
            start = i * hop_length
            end = start + frame_length
            if end > len(audio_array):
                break

            frame = audio_array[start:end]

            # Apply window
            window = np.hanning(len(frame))
            windowed_frame = frame * window

            # FFT
            fft = np.fft.fft(windowed_frame)
            magnitude = np.abs(fft)

            # Frame-level features
            frame_energy = np.sum(magnitude ** 2)
            frame_centroid = np.sum(np.arange(len(magnitude)) * magnitude) / np.sum(magnitude)
            frame_features.append([frame_energy, frame_centroid])

        if frame_features:
            frame_features = np.array(frame_features)
            # Aggregate features across frames
            features.extend([
                np.mean(frame_features[:, 0]),           # Mean energy
                np.std(frame_features[:, 0]),            # Energy std
                np.mean(frame_features[:, 1]),           # Mean centroid
                np.std(frame_features[:, 1]),            # Centroid std
                np.max(frame_features[:, 0]),            # Max energy
                np.min(frame_features[:, 0]),            # Min energy
                np.median(frame_features[:, 0]),         # Median energy
                np.var(frame_features[:, 0]),            # Energy variance
                np.mean(np.diff(frame_features[:, 0])),  # Energy delta
                np.std(np.diff(frame_features[:, 0]))    # Energy delta std
            ])
        else:
            features.extend(np.zeros(10))

        # Add some basic harmonic features
        fft_full = np.fft.fft(audio_array)
        magnitude_full = np.abs(fft_full)

        # Find fundamental frequency (simple peak detection)
        freqs = np.fft.fftfreq(len(audio_array), 1 / sample_rate)
        pos_freqs = freqs[:len(freqs) // 2]
        pos_magnitude = magnitude_full[:len(magnitude_full) // 2]

        # Harmonic features
        peak_idx = np.argmax(pos_magnitude)
        fundamental_freq = pos_freqs[peak_idx]

        # Harmonic ratios (simple approximation)
        harmonic_features = []
        for harmonic in [2, 3, 4, 5]:
            target_freq = fundamental_freq * harmonic
            if target_freq < sample_rate / 2:
                target_idx = np.argmin(np.abs(pos_freqs - target_freq))
                harmonic_ratio = pos_magnitude[target_idx] / pos_magnitude[peak_idx]
                harmonic_features.append(harmonic_ratio)
            else:
                harmonic_features.append(0.0)

        features.extend(harmonic_features)

        # Add more frequency-based features
        features.extend([
            fundamental_freq / sample_rate,                                          # Normalized fundamental
            np.sum(pos_magnitude > np.mean(pos_magnitude)),                          # Number of significant peaks
            np.sum(pos_magnitude) / len(pos_magnitude),                              # Average magnitude
            np.std(pos_magnitude),                                                   # Magnitude std
            np.max(pos_magnitude) / np.mean(pos_magnitude),                          # Peak prominence
            np.sum(pos_magnitude[:len(pos_magnitude) // 4]),                         # Low freq energy
            np.sum(pos_magnitude[len(pos_magnitude) // 4:len(pos_magnitude) // 2]),  # Mid freq energy
            np.sum(pos_magnitude[len(pos_magnitude) // 2:3 * len(pos_magnitude) // 4]),  # High freq energy
            np.sum(pos_magnitude[3 * len(pos_magnitude) // 4:]),                     # Very high freq energy
            np.corrcoef(pos_magnitude[:-1], pos_magnitude[1:])[0, 1] if len(pos_magnitude) > 1 else 0.0,  # Spectral autocorr
            np.sum(np.diff(pos_magnitude) > 0) / len(pos_magnitude)                  # Spectral flux
        ])

        return features

    def _estimate_tempo_numpy(self, audio_array: np.ndarray, sample_rate: int) -> float:
        """Simple tempo estimation using numpy only"""
        try:
            # Simple envelope-based tempo detection
            hop_length = 512
            frame_length = 2048

            # Calculate envelope
            envelope = np.abs(audio_array)

            # Downsample envelope
            n_frames = (len(envelope) - frame_length) // hop_length + 1
            envelope_frames = []
            for i in range(0, min(n_frames, 1000)):  # Limit frames
                start = i * hop_length
                end = start + frame_length
                if end > len(envelope):
                    break
                frame_energy = np.mean(envelope[start:end])
                envelope_frames.append(frame_energy)

            if len(envelope_frames) < 10:
                return 120.0

            envelope_frames = np.array(envelope_frames)

            # Find peaks in envelope
            from scipy.signal import find_peaks
            peaks, _ = find_peaks(envelope_frames, height=np.mean(envelope_frames))

            if len(peaks) > 2:
                # Calculate intervals between peaks
                peak_intervals = np.diff(peaks) * hop_length / sample_rate

                # Filter reasonable intervals (0.2 to 2 seconds)
                valid_intervals = peak_intervals[(peak_intervals > 0.2) & (peak_intervals < 2.0)]

                if len(valid_intervals) > 0:
                    avg_interval = np.mean(valid_intervals)
                    tempo = 60.0 / avg_interval
                    # Constrain to reasonable range
                    tempo = max(60, min(200, tempo))
                    return tempo

            return 120.0
        except Exception as e:
            logger.warning(f"Numpy tempo estimation failed: {e}")
            return 120.0
    def extract_mert_features(self, audio_array: np.ndarray, sample_rate: int) -> np.ndarray:
        """Extract MERT features using the Music Understanding Model"""
        try:
            # Load MERT model on demand
            self._load_mert_model()

            # Resample to 24kHz for MERT processing if needed
            if sample_rate != 24000:
                logger.info(f"Resampling from {sample_rate}Hz to 24kHz for MERT...")
                from scipy.signal import resample
                target_length = int(len(audio_array) * 24000 / sample_rate)
                audio_array_24k = resample(audio_array, target_length)
            else:
                audio_array_24k = audio_array

            # Critical: Convert to float32 for MERT (community checkpoints expect float32)
            audio_array_24k = audio_array_24k.astype(np.float32, copy=False)

            # Use multi-window strategy like CLAP to capture variation
            total_duration = len(audio_array_24k) / 24000
            logger.info(f"Processing MERT features for {len(audio_array_24k)} samples at 24kHz ({total_duration:.1f} seconds)")

            # Multi-window sampling approach (5-second windows as per MERT paper)
            hop_seconds = 5  # Move window every 5 seconds
            win_seconds = 5  # Each window is 5 seconds (MERT positional encodings trained on 5s)
            hop_samples = hop_seconds * 24000
            win_samples = win_seconds * 24000

            samples = []
            if total_duration <= win_seconds:
                # Short audio: use the entire thing
                logger.info("Short audio: using entire file for MERT")
                samples = [audio_array_24k]
            else:
                # Multi-window sampling: overlapping windows across entire track
                logger.info("Multi-window sampling for MERT: extracting overlapping windows")
                max_offset = int(total_duration - win_seconds) + 1
                for t in range(0, max_offset, hop_seconds):
                    start_sample = t * 24000
                    end_sample = start_sample + win_samples

                    # Ensure we don't go beyond the audio length
                    if end_sample <= len(audio_array_24k):
                        window = audio_array_24k[start_sample:end_sample]
                        samples.append(window)
                    else:
                        # Last window: take what's available
                        window = audio_array_24k[start_sample:]
                        if len(window) >= win_samples // 2:  # At least 2.5 seconds
                            # Pad to full window length
                            padded_window = np.pad(window, (0, win_samples - len(window)))
                            samples.append(padded_window)
                        break

                logger.info(f"Generated {len(samples)} MERT windows covering entire track")

            # Process each sample and collect embeddings
            embeddings = []
            for i, sample in enumerate(samples):
                sample_length = win_samples
                if len(sample) < sample_length:
                    # Pad short samples with zeros
                    sample = np.pad(sample, (0, sample_length - len(sample)))

                logger.info(f"Processing MERT sample {i+1}/{len(samples)}")

                # Process with MERT processor
                inputs = self.mert_processor(
                    sample,
                    sampling_rate=24000,
                    return_tensors="pt"
                )
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

                with torch.no_grad():
                    outputs = self.mert_model(**inputs, output_hidden_states=True)

                # Get all hidden states (13 layers for MERT-v1-95M)
                all_hidden_states = outputs.hidden_states  # Tuple of 13 tensors, each [batch_size, time_steps, 768]

                # Time reduction: average across the time dimension -> [13, batch_size, 768]
                time_reduced_states = torch.stack([layer.mean(dim=1) for layer in all_hidden_states])

                # Take the mean across all layers to get a single 768-dim representation
                # This follows the typical approach for utterance-level representations
                window_embedding = time_reduced_states.mean(dim=0).squeeze()  # Shape: [768]
                embeddings.append(window_embedding.cpu().numpy().flatten())

            # Average the raw embeddings from all samples (like CLAP)
            final_embedding = np.mean(embeddings, axis=0)

            # Ensure finite values
            mert_features = np.nan_to_num(final_embedding, nan=0.0, posinf=0.0, neginf=0.0)

            # L2 normalize
            norm = np.linalg.norm(mert_features)
            if norm > 0:
                mert_features = mert_features / norm
            else:
                # If norm is 0, create a small non-zero vector
                mert_features = np.ones(768, dtype=np.float32) * 0.001
                mert_features = mert_features / np.linalg.norm(mert_features)
            # Final check for finite values
            mert_features = np.nan_to_num(mert_features, nan=0.0, posinf=0.0, neginf=0.0)

            logger.info(f"Extracted MERT features: {len(mert_features)} dimensions (norm: {np.linalg.norm(mert_features):.6f})")
            return mert_features

        except Exception as e:
            logger.error(f"Error extracting MERT features: {str(e)}")
            # Return normalized zero vector if extraction fails
            zero_features = np.ones(768, dtype=np.float32) * 0.001
            return zero_features / np.linalg.norm(zero_features)

    def fuse_embeddings(self, clap_embedding: np.ndarray, pitch_features: np.ndarray = None, mert_features: np.ndarray = None) -> np.ndarray:
        """Fuse CLAP, pitch-aware, and/or MERT features based on fusion mode"""
        if FUSION_MODE == "VECTOR_CONCAT":
            # Normalize before and after concatenation (critical for proper similarity computation)
            embeddings_to_fuse = []

            # Step 1: Normalize each embedding individually (clap_u = normalize(clap))
            # Always include CLAP
            clap_u = clap_embedding / np.linalg.norm(clap_embedding)
            embeddings_to_fuse.append(clap_u)

            # Add MERT if available
            if mert_features is not None:
                mert_u = mert_features / np.linalg.norm(mert_features)
                embeddings_to_fuse.append(mert_u)

            # Add pitch features if available
            if pitch_features is not None:
                pitch_u = pitch_features / np.linalg.norm(pitch_features)
                embeddings_to_fuse.append(pitch_u)

            # Step 2: Concatenate normalized embeddings (fused = cat([clap_u, mert_u]))
            fused = np.concatenate(embeddings_to_fuse)

            # Step 3: Normalize the concatenated result (fused = normalize(fused))
            # Critical: Without this, similarity is inflated by √2 (0.83 → 0.93)
            fused_norm = np.linalg.norm(fused)
            if fused_norm > 0:
                fused = fused / fused_norm
            else:
                logger.warning("Zero norm in fused embedding - creating fallback vector")
                fused = np.ones_like(fused) * 0.001
                fused = fused / np.linalg.norm(fused)

            # Verify norm is 1.0 (assert abs(fused.norm() - 1.0) < 1e-6)
            final_norm = np.linalg.norm(fused)
            if abs(final_norm - 1.0) > 1e-6:
                logger.warning(f"Fused embedding norm is {final_norm:.8f}, not 1.0 - normalization issue!")

            # Ensure finite values for JSON serialization
            fused = np.nan_to_num(fused, nan=0.0, posinf=0.0, neginf=0.0)
            return fused
        else:
            # SCORE_FUSION: return embeddings separately for weighted similarity calculation
            result = {}
            result["clap"] = np.nan_to_num(clap_embedding, nan=0.0, posinf=0.0, neginf=0.0)
            if mert_features is not None:
                result["mert"] = np.nan_to_num(mert_features, nan=0.0, posinf=0.0, neginf=0.0)
            if pitch_features is not None:
                result["pitch"] = np.nan_to_num(pitch_features, nan=0.0, posinf=0.0, neginf=0.0)
            return result
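
    # Hedged illustration (comment only; an assumption about downstream use): if a
    # similarity index takes raw inner products of vectors it assumes are unit-norm,
    # then for two tracks a and b fused from unit CLAP/MERT vectors,
    #     dot(concat(clap_a, mert_a), concat(clap_b, mert_b))
    #         = cos(clap_a, clap_b) + cos(mert_a, mert_b),
    # i.e. up to 2.0, while each concatenation actually has norm sqrt(2).
    # Re-normalizing in Step 3 makes the score the plain average of the per-model cosines:
    #     cos(fused_a, fused_b) = 0.5 * (cos(clap_a, clap_b) + cos(mert_a, mert_b)).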
    def is_supported_format(self, image_url: str) -> bool:
        """Check if image format is supported by PIL/CLIP"""
        unsupported_extensions = ['.avif', '.heic', '.heif']
        url_lower = image_url.lower()
        return not any(url_lower.endswith(ext) for ext in unsupported_extensions)

    def detect_image_format(self, content: bytes) -> str:
        """Detect actual image format from content"""
        try:
            # Check for AVIF signature
            if content.startswith(b'\x00\x00\x00') and b'ftypavif' in content[:32]:
                return 'AVIF'
            # Check for HEIC signature
            elif content.startswith(b'\x00\x00\x00') and b'ftyp' in content[:32] and (b'heic' in content[:32] or b'heix' in content[:32]):
                return 'HEIC'
            # Check for WebP
            elif content.startswith(b'RIFF') and b'WEBP' in content[:12]:
                return 'WebP'
            # Check for PNG
            elif content.startswith(b'\x89PNG\r\n\x1a\n'):
                return 'PNG'
            # Check for JPEG
            elif content.startswith(b'\xff\xd8\xff'):
                return 'JPEG'
            # Check for GIF
            elif content.startswith((b'GIF87a', b'GIF89a')):
                return 'GIF'
            else:
                return 'Unknown'
        except Exception:
            return 'Unknown'

    def encode_image(self, image_url: str) -> list:
        try:
            logger.info(f"Processing image: {image_url}")

            # Quick URL-based format check first
            if not self.is_supported_format(image_url):
                logger.warning(f"Unsupported format detected from URL: {image_url}")
                raise HTTPException(status_code=422, detail="Unsupported image format (AVIF/HEIC not supported)")

            response = requests.get(image_url, timeout=30, headers={'User-Agent': 'CLIP-Service/1.0'})
            response.raise_for_status()

            # Detect actual format from content
            image_format = self.detect_image_format(response.content)
            logger.info(f"Detected image format: {image_format}")

            if image_format in ['AVIF', 'HEIC']:
                logger.warning(f"Unsupported format detected: {image_format} for {image_url}")
                raise HTTPException(status_code=422, detail=f"Unsupported image format: {image_format}")

            try:
                image = Image.open(io.BytesIO(response.content))
            except Exception as e:
                logger.error(f"PIL cannot open image {image_url}: {str(e)}")
                if "cannot identify image file" in str(e).lower():
                    raise HTTPException(status_code=422, detail="Unsupported or corrupted image format")
                raise

            if image.mode != 'RGB':
                logger.info(f"Converting image from {image.mode} to RGB")
                image = image.convert('RGB')

            # Resize image if too large to avoid memory issues
            max_size = 224  # CLIP's expected input size
            if max(image.size) > max_size:
                image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

            # Try multiple processor configurations
            try:
                # Method 1: Standard CLIP processing
                inputs = self.clip_processor(
                    images=image,
                    return_tensors="pt",
                    do_rescale=True,
                    do_normalize=True
                )
            except Exception as e1:
                logger.warning(f"Method 1 failed: {e1}, trying method 2...")
                try:
                    # Method 2: With padding
                    inputs = self.clip_processor(
                        images=image,
                        return_tensors="pt",
                        padding=True,
                        do_rescale=True,
                        do_normalize=True
                    )
                except Exception as e2:
                    logger.warning(f"Method 2 failed: {e2}, trying method 3...")
                    # Method 3: Manual preprocessing
                    inputs = self.clip_processor(
                        images=[image],
                        return_tensors="pt"
                    )

            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                image_features = self.clip_model.get_image_features(**inputs)
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)

            # Ensure safe values for JSON serialization
            embedding = image_features.cpu().numpy().flatten()
            embedding = np.nan_to_num(embedding, nan=0.0, posinf=0.0, neginf=0.0)
            return embedding.tolist()

        except HTTPException:
            # Preserve the 422 responses raised above instead of converting them to 500s
            raise
        except Exception as e:
            logger.error(f"Error encoding image {image_url}: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Failed to encode image: {str(e)}")

    def encode_text(self, text: str) -> list:
        try:
            logger.info(f"Processing text: {text[:50]}...")
            inputs = self.clip_processor(text=[text], return_tensors="pt", padding=True).to(self.device)

            with torch.no_grad():
                text_features = self.clip_model.get_text_features(**inputs)
                text_features = text_features / text_features.norm(dim=-1, keepdim=True)

            # Ensure safe values for JSON serialization
            embedding = text_features.cpu().numpy().flatten()
            embedding = np.nan_to_num(embedding, nan=0.0, posinf=0.0, neginf=0.0)
            return embedding.tolist()
        except Exception as e:
            logger.error(f"Error encoding text '{text[:50]}...': {str(e)}")
            raise HTTPException(status_code=500, detail=f"Failed to encode text: {str(e)}")
    def encode_audio(self, audio_url: str) -> list:
        try:
            logger.info(f"Processing audio: {audio_url}")

            # Load CLAP model on demand
            self._load_clap_model()

            # Pitch fusion is enabled if pitch-aware features are available
            if ENABLE_PITCH_FUSION and PITCH_AWARE_AVAILABLE:
                logger.info("Pitch fusion enabled with librosa features")

            # Download audio file
            response = requests.get(audio_url, timeout=60, headers={'User-Agent': 'CLAP-Service/1.0'})
            response.raise_for_status()

            # Save to temporary file
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
                tmp_file.write(response.content)
                tmp_path = tmp_file.name

            try:
                # Load audio using soundfile first, then resample with scipy if needed.
                # This avoids the caching issues with librosa.load
                logger.info("Loading audio with soundfile...")
                audio_array, original_sr = sf.read(tmp_path)

                # Convert to mono if needed
                if len(audio_array.shape) > 1:
                    logger.info("Converting stereo to mono")
                    audio_array = audio_array.mean(axis=1)
                else:
                    logger.info("Audio is already mono")

                # Resample to 48kHz for CLAP processing
                if original_sr != 48000:
                    logger.info(f"Resampling from {original_sr}Hz to 48kHz...")
                    # Use scipy for resampling to avoid librosa caching issues
                    from scipy.signal import resample
                    target_length = int(len(audio_array) * 48000 / original_sr)
                    audio_array_48k = resample(audio_array, target_length)
                else:
                    audio_array_48k = audio_array

                # Critical: Convert to float32 for CLAP (community checkpoints expect float32)
                audio_array_48k = audio_array_48k.astype(np.float32, copy=False)

                total_duration = len(audio_array_48k) / 48000
                logger.info(f"Audio loaded: {len(audio_array_48k)} samples at 48kHz ({total_duration:.1f} seconds)")

                # Check audio duration limit and truncate both arrays consistently
                if total_duration > MAX_AUDIO_DURATION_SEC:
                    logger.warning(f"Audio duration {total_duration:.1f}s exceeds limit {MAX_AUDIO_DURATION_SEC}s, truncating...")
                    max_samples_48k = MAX_AUDIO_DURATION_SEC * 48000
                    max_samples_orig = MAX_AUDIO_DURATION_SEC * original_sr
                    # Truncate both arrays to keep MERT and Pitch processing within limits
                    audio_array_48k = audio_array_48k[:max_samples_48k]
                    audio_array = audio_array[:max_samples_orig]
                    total_duration = MAX_AUDIO_DURATION_SEC
                    logger.info(f"Truncated both arrays to {total_duration:.1f} seconds (48kHz: {len(audio_array_48k)} samples, {original_sr}Hz: {len(audio_array)} samples)")

                # Process with CLAP (10s windows, 5s hops)
                clap_embedding = self._process_clap_embeddings(audio_array_48k, total_duration)

                # Initialize additional features
                pitch_features = None
                mert_features = None

                # Process with MERT features if enabled
                if ENABLE_MERT_FUSION and MERT_AVAILABLE:
                    try:
                        logger.info("Processing with MERT features for fusion...")
                        mert_features = self.extract_mert_features(audio_array, original_sr)
                    except Exception as e:
                        logger.error(f"MERT feature processing failed: {str(e)}")
                        mert_features = None

                # Process with pitch-aware features if enabled (can run alongside MERT)
                if ENABLE_PITCH_FUSION and PITCH_AWARE_AVAILABLE:
                    try:
                        logger.info("Processing with pitch-aware features for fusion...")
                        pitch_features = self._process_pitch_features(audio_array, original_sr, total_duration)
                    except Exception as e:
                        logger.error(f"Pitch feature processing failed: {str(e)}")
                        pitch_features = None

                # Handle fusion based on what features are available
                if mert_features is not None or pitch_features is not None:
                    if FUSION_MODE == "VECTOR_CONCAT":
                        final_embedding = self.fuse_embeddings(clap_embedding, pitch_features, mert_features)
                        if mert_features is not None:
                            logger.info(f"Fused embedding dimensions: {len(final_embedding)} (CLAP 512 + MERT 768)")
                        elif pitch_features is not None:
                            logger.info(f"Fused embedding dimensions: {len(final_embedding)} (CLAP 512 + Pitch 85)")
                        return final_embedding.tolist()
                    else:
                        # SCORE_FUSION: return embeddings separately
                        logger.info("Score fusion mode: returning separate embeddings")
                        result = self.fuse_embeddings(clap_embedding, pitch_features, mert_features)
                        result["fusion_mode"] = "SCORE_FUSION"
                        result["fusion_alpha"] = FUSION_ALPHA

                        # Convert to lists for JSON serialization
                        for key in result:
                            if isinstance(result[key], np.ndarray):
                                result[key] = result[key].tolist()
                        return result
                else:
                    # CLAP-only processing (backwards compatible)
                    logger.info("Using CLAP-only processing")
                    return clap_embedding.tolist()

            finally:
                # Clean up temp file
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        except Exception as e:
            logger.error(f"Error encoding audio {audio_url}: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Failed to encode audio: {str(e)}")

    def _process_clap_embeddings(self, audio_array: np.ndarray, total_duration: float) -> np.ndarray:
        """Process audio with CLAP using 10s windows and 5s hops"""
        # Multi-window sampling approach for better discrimination
        hop_seconds = 5   # Move window every 5 seconds
        win_seconds = 10  # Each window is 10 seconds
        hop_samples = hop_seconds * 48000
        win_samples = win_seconds * 48000

        samples = []
        if total_duration <= win_seconds:
            # Short audio: use the entire thing
            logger.info("Short audio: using entire file for CLAP")
            samples = [audio_array]
        else:
            # Multi-window sampling: overlapping windows across entire track
            logger.info("Multi-window sampling for CLAP: extracting overlapping windows")
            max_offset = int(total_duration - win_seconds) + 1
            for t in range(0, max_offset, hop_seconds):
                start_sample = t * 48000
                end_sample = start_sample + win_samples

                # Ensure we don't go beyond the audio length
                if end_sample <= len(audio_array):
                    window = audio_array[start_sample:end_sample]
                    samples.append(window)
                else:
                    # Last window: take what's available
                    window = audio_array[start_sample:]
                    if len(window) >= win_samples // 2:  # At least 5 seconds
                        # Pad to full window length
                        padded_window = np.pad(window, (0, win_samples - len(window)))
                        samples.append(padded_window)
                    break

            logger.info(f"Generated {len(samples)} CLAP windows covering entire track")

        # Process each sample and collect embeddings
        embeddings = []
        for i, sample in enumerate(samples):
            sample_length = win_samples
            if len(sample) < sample_length:
                # Pad short samples with zeros
                sample = np.pad(sample, (0, sample_length - len(sample)))

            logger.info(f"Processing CLAP sample {i+1}/{len(samples)}")

            # Process with CLAP
            inputs = self.clap_processor(
                audios=sample,
                sampling_rate=48000,
                return_tensors="pt"
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                audio_features = self.clap_model.get_audio_features(**inputs)
                window_vec = audio_features.squeeze(0)  # 512-D, no L2
                embeddings.append(window_vec.cpu().numpy().flatten())

        # Average the raw embeddings from all samples
        final_embedding = np.mean(embeddings, axis=0)

        # Ensure finite values before normalization
        final_embedding = np.nan_to_num(final_embedding, nan=0.0, posinf=0.0, neginf=0.0)

        # Ensure proper L2 normalization for cosine similarity
        norm = np.linalg.norm(final_embedding)
        if norm > 0:
            final_embedding = final_embedding / norm
        else:
            logger.warning("Zero norm CLAP embedding detected")
            # Create a small normalized vector instead of zero
            final_embedding = np.ones_like(final_embedding) * 0.001
            final_embedding = final_embedding / np.linalg.norm(final_embedding)

        # Final safety check for JSON serialization
        final_embedding = np.nan_to_num(final_embedding, nan=0.0, posinf=0.0, neginf=0.0)

        # Verify normalization
        final_norm = np.linalg.norm(final_embedding)
        logger.info(f"Final CLAP embedding norm: {final_norm:.6f} (should be ~1.0)")

        return final_embedding
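
    # Hedged worked example (comment only): with 10 s windows and 5 s hops, a 23 s
    # track gives max_offset = int(23 - 10) + 1 = 14, so CLAP windows start at
    # t = 0, 5, and 10 seconds; a trailing window shorter than the full length but
    # at least half of it would be zero-padded before encoding.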
    def _process_pitch_features(self, audio_array: np.ndarray, original_sr: int, total_duration: float) -> np.ndarray:
        """Process audio with pitch-aware features using 5s windows and 2s hops"""
        # Pitch feature processing parameters
        hop_seconds = 2  # Move window every 2 seconds
        win_seconds = 5  # Each window is 5 seconds
        hop_samples = hop_seconds * original_sr
        win_samples = win_seconds * original_sr

        samples = []
        if total_duration <= win_seconds:
            # Short audio: use the entire thing
            logger.info("Short audio: using entire file for pitch features")
            samples = [audio_array]
        else:
            # Multi-window sampling: overlapping windows across entire track
            logger.info("Multi-window sampling for pitch features: extracting overlapping windows")
            max_offset = int(total_duration - win_seconds) + 1
            for t in range(0, max_offset, hop_seconds):
                start_sample = t * original_sr
                end_sample = start_sample + win_samples

                # Ensure we don't go beyond the audio length
                if end_sample <= len(audio_array):
                    window = audio_array[start_sample:end_sample]
                    samples.append(window)
                else:
                    # Last window: take what's available
                    window = audio_array[start_sample:]
                    if len(window) >= win_samples // 2:  # At least 2.5 seconds
                        # Pad to full window length
                        padded_window = np.pad(window, (0, win_samples - len(window)))
                        samples.append(padded_window)
                    break

            logger.info(f"Generated {len(samples)} pitch feature windows covering entire track")

        # Process each sample and collect features
        feature_vectors = []
        for i, sample in enumerate(samples):
            sample_length = win_samples
            if len(sample) < sample_length:
                # Pad short samples with zeros
                sample = np.pad(sample, (0, sample_length - len(sample)))

            logger.info(f"Processing pitch features sample {i+1}/{len(samples)}")

            # Extract pitch features
            pitch_features = self.extract_pitch_features(sample, original_sr)
            feature_vectors.append(pitch_features)

        # Average the raw feature vectors from all samples
        final_features = np.mean(feature_vectors, axis=0)

        # Ensure finite values before normalization
        final_features = np.nan_to_num(final_features, nan=0.0, posinf=0.0, neginf=0.0)

        # Ensure proper L2 normalization for cosine similarity
        norm = np.linalg.norm(final_features)
        if norm > 0:
            final_features = final_features / norm
        else:
            logger.warning("Zero norm pitch features detected")
            # Create a small normalized vector instead of zero
            final_features = np.ones_like(final_features) * 0.001
            final_features = final_features / np.linalg.norm(final_features)

        # Final safety check for JSON serialization
        final_features = np.nan_to_num(final_features, nan=0.0, posinf=0.0, neginf=0.0)

        # Verify normalization
        final_norm = np.linalg.norm(final_features)
        logger.info(f"Final pitch features norm: {final_norm:.6f} (should be ~1.0)")

        return final_features


# Initialize service with error handling
logger.info("Initializing CLIP service...")
try:
    clip_service = CLIPService()
    logger.info("CLIP service initialized successfully!")
except Exception as e:
    logger.error(f"Failed to initialize CLIP service: {str(e)}")
    logger.error(f"Error details: {type(e).__name__}: {str(e)}")
    # For now, we'll let the app start but service calls will fail gracefully
    clip_service = None


class ImageRequest(BaseModel):
    image_url: str
class TextRequest(BaseModel):
    text: str


class AudioRequest(BaseModel):
    audio_url: str


@app.get("/")
async def root():
    return {
        "message": "CLIP Service API",
        "version": "1.0.0",
        "model": "clip-vit-large-patch14",
        "endpoints": ["/encode/image", "/encode/text", "/encode/audio", "/health"],
        "status": "ready" if clip_service else "error"
    }


@app.post("/encode/image")
async def encode_image(request: ImageRequest):
    if not clip_service:
        raise HTTPException(status_code=503, detail="CLIP service not available")
    embedding = clip_service.encode_image(request.image_url)
    safe_embedding = sanitize_for_json(embedding)
    return {"embedding": safe_embedding, "dimensions": len(safe_embedding)}


@app.post("/encode/text")
async def encode_text(request: TextRequest):
    if not clip_service:
        raise HTTPException(status_code=503, detail="CLIP service not available")
    embedding = clip_service.encode_text(request.text)
    safe_embedding = sanitize_for_json(embedding)
    return {"embedding": safe_embedding, "dimensions": len(safe_embedding)}


@app.post("/encode/audio")
async def encode_audio(request: AudioRequest):
    if not clip_service:
        raise HTTPException(status_code=503, detail="CLAP service not available")
    if not CLAP_AVAILABLE:
        raise HTTPException(status_code=501, detail="CLAP model not available in this transformers version")

    embedding = clip_service.encode_audio(request.audio_url)

    # Handle both single embedding and fusion mode results
    if isinstance(embedding, dict):
        # Score fusion mode - sanitize all embeddings
        safe_embedding = {}
        dimensions = {}
        for key, value in embedding.items():
            if key in ["clap", "pitch", "mert"] and isinstance(value, list):
                safe_embedding[key] = sanitize_for_json(value)
                dimensions[key] = len(safe_embedding[key])
            else:
                safe_embedding[key] = value
        return {"embedding": safe_embedding, "dimensions": dimensions}
    else:
        # Single embedding (CLAP-only or concatenated)
        safe_embedding = sanitize_for_json(embedding)
        return {"embedding": safe_embedding, "dimensions": len(safe_embedding)}


@app.get("/health")
async def health_check():
    if not clip_service:
        return {
            "status": "unhealthy",
            "model": "clip-vit-large-patch14",
            "error": "Service failed to initialize"
        }

    health_info = {
        "status": "healthy",
        "models": {
            "clip": "clip-vit-large-patch14",
            "clap": f"clap-htsat-unfused (lazy loaded, method: {CLAP_METHOD})" if CLAP_AVAILABLE else "not available",
            "mert": f"MERT-v1-95M (lazy loaded, method: {MERT_METHOD})" if MERT_AVAILABLE else "not available"
        },
        "device": clip_service.device,
        "service": "ready",
        "cache_dir": cache_dir
    }

    # Add pitch-aware information
    if PITCH_AWARE_AVAILABLE:
        health_info["models"]["pitch_aware"] = f"librosa features ({PITCH_METHOD})"

    # Add fusion information
    fusion_enabled = ENABLE_PITCH_FUSION or ENABLE_MERT_FUSION
    if fusion_enabled:
        health_info["fusion"] = {
            "enabled": True,
            "mode": FUSION_MODE,
            "pitch_fusion_enabled": ENABLE_PITCH_FUSION,
            "mert_fusion_enabled": ENABLE_MERT_FUSION,
            "pitch_aware_available": PITCH_AWARE_AVAILABLE,
            "mert_available": MERT_AVAILABLE
        }
        if FUSION_MODE == "SCORE_FUSION":
            health_info["fusion"]["alpha"] = FUSION_ALPHA
    else:
        health_info["fusion"] = {
            "enabled": False,
            "mode": "CLAP_ONLY"
        }

    return health_info


if __name__ == "__main__":
    import uvicorn
    port = int(os.environ.get("PORT", 7860))  # Hugging Face uses port 7860
    uvicorn.run(app, host="0.0.0.0", port=port)
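
# Hedged usage sketch (illustration only; never invoked by the service itself).
# The base URL and media URLs below are assumptions for demonstration.
def _example_client_usage(base_url: str = "http://localhost:7860") -> None:
    """Post sample requests to /encode/text and /encode/audio and print the reported dimensions."""
    text_resp = requests.post(f"{base_url}/encode/text", json={"text": "lofi hip hop beat"}, timeout=60)
    text_resp.raise_for_status()
    print("text embedding dimensions:", text_resp.json()["dimensions"])

    audio_resp = requests.post(f"{base_url}/encode/audio", json={"audio_url": "https://example.com/track.wav"}, timeout=300)
    audio_resp.raise_for_status()
    # In SCORE_FUSION mode "dimensions" is a dict keyed by "clap"/"mert"/"pitch".
    print("audio embedding dimensions:", audio_resp.json()["dimensions"])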