Upload 2 files
- app.py: +90 -30
- requirements.txt: +3 -2
app.py
CHANGED
@@ -3,7 +3,19 @@ import tempfile
 from pathlib import Path
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from transformers import CLIPProcessor, CLIPModel
+from transformers import CLIPProcessor, CLIPModel
+try:
+    from transformers import ClapModel, ClapProcessor
+    CLAP_AVAILABLE = True
+    CLAP_METHOD = "transformers"
+except ImportError:
+    try:
+        import laion_clap
+        CLAP_AVAILABLE = True
+        CLAP_METHOD = "laion"
+    except ImportError:
+        CLAP_AVAILABLE = False
+        CLAP_METHOD = None
 import torch
 from PIL import Image
 import requests
@@ -28,7 +40,7 @@ app = FastAPI(title="CLIP Service", version="1.0.0")

 class CLIPService:
     def __init__(self):
-        logger.info("Loading CLIP
+        logger.info("Loading CLIP model...")
         try:
             # Use CPU for Hugging Face free tier
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -47,25 +59,48 @@ class CLIPService:
                 local_files_only=False
             )

-            #
-            self.clap_model =
-                "laion/clap-htsat-unfused",
-                cache_dir=cache_dir,
-                local_files_only=False
-            ).to(self.device)
+            # Initialize CLAP model placeholders (loaded on demand)
+            self.clap_model = None
+            self.clap_processor = None

-            self.clap_processor = ClapProcessor.from_pretrained(
-                "laion/clap-htsat-unfused",
-                cache_dir=cache_dir,
-                local_files_only=False
-            )
-
-            logger.info(f"CLIP and CLAP models loaded successfully on {self.device}")
+            logger.info(f"CLIP model loaded successfully on {self.device}")

         except Exception as e:
-            logger.error(f"Failed to load
+            logger.error(f"Failed to load CLIP model: {str(e)}")
             raise RuntimeError(f"Model loading failed: {str(e)}")

+    def _load_clap_model(self):
+        """Load CLAP model on demand"""
+        if not CLAP_AVAILABLE:
+            raise RuntimeError("CLAP model not available")
+
+        if self.clap_model is None:
+            logger.info(f"Loading CLAP model on demand using {CLAP_METHOD} method...")
+            try:
+                if CLAP_METHOD == "transformers":
+                    self.clap_model = ClapModel.from_pretrained(
+                        "laion/clap-htsat-unfused",
+                        cache_dir=cache_dir,
+                        local_files_only=False
+                    ).to(self.device)
+
+                    self.clap_processor = ClapProcessor.from_pretrained(
+                        "laion/clap-htsat-unfused",
+                        cache_dir=cache_dir,
+                        local_files_only=False
+                    )
+
+                elif CLAP_METHOD == "laion":
+                    # Use the official LAION CLAP library
+                    self.clap_model = laion_clap.CLAP_Module(enable_fusion=False)
+                    self.clap_model.load_ckpt()  # Load the default checkpoint
+
+                logger.info(f"CLAP model loaded successfully on {self.device} using {CLAP_METHOD}")
+
+            except Exception as e:
+                logger.error(f"Failed to load CLAP model: {str(e)}")
+                raise RuntimeError(f"CLAP model loading failed: {str(e)}")
+
     def is_supported_format(self, image_url: str) -> bool:
         """Check if image format is supported by PIL/CLIP"""
         unsupported_extensions = ['.avif', '.heic', '.heif']
@@ -193,6 +228,9 @@ class CLIPService:
         try:
             logger.info(f"Processing audio: {audio_url}")

+            # Load CLAP model on demand
+            self._load_clap_model()
+
             # Download audio file
             response = requests.get(audio_url, timeout=60, headers={'User-Agent': 'CLAP-Service/1.0'})
             response.raise_for_status()
@@ -212,20 +250,36 @@ class CLIPService:
             if len(audio_array) > max_length:
                 audio_array = audio_array[:max_length]

-            # Process with CLAP
-            inputs = self.clap_processor(
-                audios=audio_array,
-                sampling_rate=48000,
-                return_tensors="pt"
-            )
-
-            inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-            with torch.no_grad():
-                audio_features = self.clap_model.get_audio_features(**inputs)
-                audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
+            # Process with CLAP based on method
+            if CLAP_METHOD == "transformers":
+                inputs = self.clap_processor(
+                    audios=audio_array,
+                    sampling_rate=48000,
+                    return_tensors="pt"
+                )
+
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    audio_features = self.clap_model.get_audio_features(**inputs)
+                    audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
+
+                return audio_features.cpu().numpy().flatten().tolist()
+
+            elif CLAP_METHOD == "laion":
+                # Use LAION CLAP library
+                with torch.no_grad():
+                    audio_features = self.clap_model.get_audio_embedding_from_data(
+                        x=audio_array,
+                        use_tensor=True
+                    )
+                    # Normalize embedding
+                    audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
+
+                return audio_features.cpu().numpy().flatten().tolist()

-            return audio_features.cpu().numpy().flatten().tolist()
+            else:
+                raise RuntimeError(f"Unknown CLAP method: {CLAP_METHOD}")

         finally:
             # Clean up temp file
@@ -286,6 +340,9 @@ async def encode_audio(request: AudioRequest):
     if not clip_service:
         raise HTTPException(status_code=503, detail="CLAP service not available")

+    if not CLAP_AVAILABLE:
+        raise HTTPException(status_code=501, detail="CLAP model not available in this transformers version")
+
     embedding = clip_service.encode_audio(request.audio_url)
     return {"embedding": embedding, "dimensions": len(embedding)}

@@ -300,7 +357,10 @@ async def health_check():

     return {
         "status": "healthy",
-        "models":
+        "models": {
+            "clip": "clip-vit-large-patch14",
+            "clap": f"clap-htsat-unfused (lazy loaded, method: {CLAP_METHOD})" if CLAP_AVAILABLE else "not available"
+        },
         "device": clip_service.device,
         "service": "ready",
         "cache_dir": cache_dir
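For context, a minimal client sketch against this Space. The route paths (/health, /encode_audio) and the base URL are assumptions, since the route decorators fall outside the diff; the audio_url request field and the embedding/dimensions response keys are taken from the handlers shown above.

import requests

# Hypothetical base URL of the Space; replace with the real endpoint (assumed).
BASE_URL = "https://your-space.hf.space"

# Health check: the handler above reports status, models, device, and cache_dir.
health = requests.get(f"{BASE_URL}/health", timeout=30).json()
print(health["status"], health["models"])

# Audio embedding: encode_audio() reads request.audio_url and returns
# {"embedding": [...], "dimensions": len(embedding)}.
resp = requests.post(
    f"{BASE_URL}/encode_audio",
    json={"audio_url": "https://example.com/sample.wav"},
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["dimensions"])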
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 torch==2.0.1
-transformers
+transformers>=4.35.0
 Pillow==9.5.0
 requests==2.31.0
 fastapi==0.104.1
@@ -9,4 +9,5 @@ pydantic==2.5.0
 numpy<2.0.0
 librosa>=0.10.0
 soundfile>=0.12.1
-datasets>=2.14.0
+datasets>=2.14.0
+laion-clap
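After installing these requirements (pip install -r requirements.txt), a quick sanity check that mirrors the import fallback added to app.py shows which CLAP backend the service will pick up:

# Sketch: same fallback order as app.py (transformers ClapModel first, then laion-clap).
try:
    from transformers import ClapModel, ClapProcessor
    method = "transformers"
except ImportError:
    try:
        import laion_clap
        method = "laion"
    except ImportError:
        method = None

print(f"CLAP backend: {method or 'not available'}")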