Spaces:

rmoxon
/

strandtest

Paused

App Files Files Community

rmoxon committed on Jul 15

Commit

c819b55

verified ·

1 Parent(s): 58c2e09

Upload 4 files

Browse files

Files changed (4) hide show

app-simple.py +239 -0
app.py +19 -42
requirements-simple.txt +9 -7
requirements.txt +9 -10

app-simple.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import os
+import tempfile
+from pathlib import Path
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from transformers import CLIPProcessor, CLIPModel
+import torch
+from PIL import Image
+import requests
+import numpy as np
+import io
+import logging
+# Send the Transformers, HF hub and Torch caches to one shared directory
+cache_dir = os.environ.get('TRANSFORMERS_CACHE', '/code/cache')
+os.makedirs(cache_dir, exist_ok=True)
+os.environ.update(
+    TRANSFORMERS_CACHE=cache_dir,
+    HF_HOME=cache_dir,
+    TORCH_HOME=cache_dir,
+)
+# Module-level logger, INFO verbosity
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Application object the route decorators further down register against
+app = FastAPI(title="CLIP Service", version="1.0.0")
+class CLIPService:
+    """Load a CLIP model once and expose image/text embedding helpers.
+    Embeddings are L2-normalised and returned as flat lists of floats.
+    """
+    def __init__(self):
+        """Load the CLIP model and processor, caching files under ``cache_dir``.
+        Raises:
+            RuntimeError: if loading the model or the processor fails.
+        """
+        logger.info("Loading CLIP model...")
+        model_name = "openai/clip-vit-large-patch14"
+        try:
+            # Run on the GPU when one is visible, otherwise fall back to CPU
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+            logger.info(f"Using device: {self.device}")
+            # Load model with explicit cache directory
+            self.model = CLIPModel.from_pretrained(
+                model_name,
+                cache_dir=cache_dir,
+                local_files_only=False
+            ).to(self.device)
+            self.processor = CLIPProcessor.from_pretrained(
+                model_name,
+                cache_dir=cache_dir,
+                local_files_only=False
+            )
+            logger.info(f"CLIP model loaded successfully on {self.device}")
+        except Exception as e:
+            logger.error(f"Failed to load CLIP model: {str(e)}")
+            raise RuntimeError(f"Model loading failed: {str(e)}") from e
+    def is_supported_format(self, image_url: str) -> bool:
+        """Return False when the URL ends in an extension PIL/CLIP cannot decode."""
+        unsupported_extensions = ('.avif', '.heic', '.heif')
+        return not image_url.lower().endswith(unsupported_extensions)
+    def detect_image_format(self, content: bytes) -> str:
+        """Identify the image container from its leading magic bytes.
+        Returns:
+            One of 'AVIF', 'HEIC', 'WebP', 'PNG', 'JPEG', 'GIF' or 'Unknown'.
+            'Unknown' is also returned when ``content`` is not bytes-like.
+        """
+        try:
+            header = content[:32]
+            # AVIF/HEIC: a box length whose leading bytes are zero, then
+            # 'ftyp' and a brand code. The signatures must be real bytes
+            # (single backslash escapes), not backslash text.
+            is_ftyp_box = content.startswith(b'\x00\x00\x00') and b'ftyp' in header
+            if is_ftyp_box and b'ftypavif' in header:
+                return 'AVIF'
+            if is_ftyp_box and (b'heic' in header or b'heix' in header):
+                return 'HEIC'
+            if content.startswith(b'RIFF') and b'WEBP' in content[:12]:
+                return 'WebP'
+            if content.startswith(b'\x89PNG\r\n\x1a\n'):
+                return 'PNG'
+            if content.startswith(b'\xff\xd8\xff'):
+                return 'JPEG'
+            if content.startswith((b'GIF87a', b'GIF89a')):
+                return 'GIF'
+            return 'Unknown'
+        except (TypeError, AttributeError):
+            # Best effort: non-bytes input (e.g. None or str) is simply unknown
+            return 'Unknown'
+    def encode_image(self, image_url: str) -> list:
+        """Download the image at ``image_url`` and return its CLIP embedding.
+        Returns:
+            The L2-normalised image feature vector as a flat list of floats.
+        Raises:
+            HTTPException: 422 when the format is unsupported or PIL cannot
+                identify the data; 500 for any other failure (download,
+                preprocessing, inference).
+        """
+        try:
+            logger.info(f"Processing image: {image_url}")
+            # Quick URL-based format check first
+            if not self.is_supported_format(image_url):
+                logger.warning(f"Unsupported format detected from URL: {image_url}")
+                raise HTTPException(status_code=422, detail="Unsupported image format (AVIF/HEIC not supported)")
+            # NOTE(review): image_url is caller-supplied and fetched from the
+            # server; consider restricting schemes/hosts to limit SSRF exposure.
+            response = requests.get(image_url, timeout=30, headers={'User-Agent': 'CLIP-Service/1.0'})
+            response.raise_for_status()
+            # Detect actual format from content
+            image_format = self.detect_image_format(response.content)
+            logger.info(f"Detected image format: {image_format}")
+            if image_format in ['AVIF', 'HEIC']:
+                logger.warning(f"Unsupported format detected: {image_format} for {image_url}")
+                raise HTTPException(status_code=422, detail=f"Unsupported image format: {image_format}")
+            try:
+                image = Image.open(io.BytesIO(response.content))
+            except Exception as e:
+                logger.error(f"PIL cannot open image {image_url}: {str(e)}")
+                if "cannot identify image file" in str(e).lower():
+                    raise HTTPException(status_code=422, detail="Unsupported or corrupted image format")
+                raise
+            if image.mode != 'RGB':
+                logger.info(f"Converting image from {image.mode} to RGB")
+                image = image.convert('RGB')
+            # Resize image if too large to avoid memory issues
+            max_size = 224  # CLIP's expected input size
+            if max(image.size) > max_size:
+                image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
+            # Try multiple processor configurations
+            try:
+                # Method 1: Standard CLIP processing
+                inputs = self.processor(
+                    images=image,
+                    return_tensors="pt",
+                    do_rescale=True,
+                    do_normalize=True
+                )
+            except Exception as e1:
+                logger.warning(f"Method 1 failed: {e1}, trying method 2...")
+                try:
+                    # Method 2: With padding
+                    inputs = self.processor(
+                        images=image,
+                        return_tensors="pt",
+                        padding=True,
+                        do_rescale=True,
+                        do_normalize=True
+                    )
+                except Exception as e2:
+                    logger.warning(f"Method 2 failed: {e2}, trying method 3...")
+                    # Method 3: Manual preprocessing
+                    inputs = self.processor(
+                        images=[image],
+                        return_tensors="pt"
+                    )
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+            with torch.no_grad():
+                image_features = self.model.get_image_features(**inputs)
+                # Scale to unit length
+                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
+            return image_features.cpu().numpy().flatten().tolist()
+        except HTTPException:
+            # Let the deliberate 422 responses through; without this clause the
+            # generic handler below would re-wrap them as 500 errors.
+            raise
+        except Exception as e:
+            logger.error(f"Error encoding image {image_url}: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Failed to encode image: {str(e)}") from e
+    def encode_text(self, text: str) -> list:
+        """Return the L2-normalised CLIP embedding of ``text`` as a flat list.
+        Raises:
+            HTTPException: 500 if tokenisation or inference fails.
+        """
+        try:
+            logger.info(f"Processing text: {text[:50]}...")
+            inputs = self.processor(text=[text], return_tensors="pt", padding=True).to(self.device)
+            with torch.no_grad():
+                text_features = self.model.get_text_features(**inputs)
+                # Scale to unit length
+                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+            return text_features.cpu().numpy().flatten().tolist()
+        except Exception as e:
+            logger.error(f"Error encoding text '{text[:50]}...': {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Failed to encode text: {str(e)}") from e
+# Build the shared service instance at import time; if that fails, log why
+# and leave clip_service as None so the routes can report it
+logger.info("Initializing CLIP service...")
+try:
+    clip_service = CLIPService()
+except Exception as e:
+    logger.error(f"Failed to initialize CLIP service: {str(e)}")
+    logger.error(f"Error details: {type(e).__name__}: {str(e)}")
+    clip_service = None
+else:
+    logger.info("CLIP service initialized successfully!")
+class ImageRequest(BaseModel):  # request body accepted by POST /encode/image
+    image_url: str  # forwarded unchanged to CLIPService.encode_image, which downloads it
+class TextRequest(BaseModel):  # request body accepted by POST /encode/text
+    text: str  # forwarded unchanged to CLIPService.encode_text
+@app.get("/")
+async def root():
+    # Service summary: name, version, model, available routes and whether
+    # the CLIP service object was created successfully
+    readiness = "ready" if clip_service else "error"
+    summary = {
+        "message": "CLIP Service API",
+        "version": "1.0.0",
+        "model": "clip-vit-large-patch14",
+        "endpoints": ["/encode/image", "/encode/text", "/health"],
+        "status": readiness,
+    }
+    return summary
+@app.post("/encode/image")
+async def encode_image(request: ImageRequest):
+    # Embed the image behind request.image_url; answer 503 when the service
+    # object is missing
+    if clip_service:
+        vector = clip_service.encode_image(request.image_url)
+        return {"embedding": vector, "dimensions": len(vector)}
+    raise HTTPException(status_code=503, detail="CLIP service not available")
+@app.post("/encode/text")
+async def encode_text(request: TextRequest):
+    # Embed request.text; answer 503 when the service object is missing
+    if clip_service:
+        vector = clip_service.encode_text(request.text)
+        return {"embedding": vector, "dimensions": len(vector)}
+    raise HTTPException(status_code=503, detail="CLIP service not available")
+@app.get("/health")
+async def health_check():
+    # Report whether the CLIP service object exists, plus device and cache
+    # location when it does
+    model_name = "clip-vit-large-patch14"
+    if clip_service:
+        return {
+            "status": "healthy",
+            "model": model_name,
+            "device": clip_service.device,
+            "service": "ready",
+            "cache_dir": cache_dir,
+        }
+    return {
+        "status": "unhealthy",
+        "model": model_name,
+        "error": "Service failed to initialize",
+    }
+if __name__ == "__main__":
+    import uvicorn
+    # Hugging Face uses port 7860; a PORT environment variable overrides it
+    listen_port = int(os.getenv("PORT", 7860))
+    uvicorn.run(app, host="0.0.0.0", port=listen_port)

app.py CHANGED Viewed

@@ -9,13 +9,8 @@ try:
     CLAP_AVAILABLE = True
     CLAP_METHOD = "transformers"
 except ImportError as e1:
-    try:
-        import laion_clap
-        CLAP_AVAILABLE = True
-        CLAP_METHOD = "laion"
-    except ImportError as e2:
-        CLAP_AVAILABLE = False
-        CLAP_METHOD = None
 import torch
 from PIL import Image
 import requests
@@ -77,33 +72,31 @@ class CLIPService:
     def _load_clap_model(self):
         """Load CLAP model on demand"""
         if not CLAP_AVAILABLE:
-            raise RuntimeError("CLAP model not available")
         if self.clap_model is None:
             logger.info(f"Loading CLAP model on demand using {CLAP_METHOD} method...")
             try:
                 if CLAP_METHOD == "transformers":
                     self.clap_model = ClapModel.from_pretrained(
                         "laion/clap-htsat-unfused",
                         cache_dir=cache_dir,
                         local_files_only=False
                     ).to(self.device)
                     self.clap_processor = ClapProcessor.from_pretrained(
                         "laion/clap-htsat-unfused",
                         cache_dir=cache_dir,
                         local_files_only=False
                     )
-                elif CLAP_METHOD == "laion":
-                    # Use the official LAION CLAP library
-                    self.clap_model = laion_clap.CLAP_Module(enable_fusion=False)
-                    self.clap_model.load_ckpt()  # Load the default checkpoint
                 logger.info(f"CLAP model loaded successfully on {self.device} using {CLAP_METHOD}")
             except Exception as e:
                 logger.error(f"Failed to load CLAP model: {str(e)}")
                 raise RuntimeError(f"CLAP model loading failed: {str(e)}")
     def is_supported_format(self, image_url: str) -> bool:
@@ -255,36 +248,20 @@ class CLIPService:
                 if len(audio_array) > max_length:
                     audio_array = audio_array[:max_length]
-                # Process with CLAP based on method
-                if CLAP_METHOD == "transformers":
-                    inputs = self.clap_processor(
-                        audios=audio_array,
-                        sampling_rate=48000,
-                        return_tensors="pt"
-                    )
-                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                    with torch.no_grad():
-                        audio_features = self.clap_model.get_audio_features(**inputs)
-                        audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
-                    return audio_features.cpu().numpy().flatten().tolist()
-                elif CLAP_METHOD == "laion":
-                    # Use LAION CLAP library
-                    with torch.no_grad():
-                        audio_features = self.clap_model.get_audio_embedding_from_data(
-                            x=audio_array,
-                            use_tensor=True
-                        )
-                        # Normalize embedding
-                        audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
-                    return audio_features.cpu().numpy().flatten().tolist()
-                else:
-                    raise RuntimeError(f"Unknown CLAP method: {CLAP_METHOD}")
             finally:
                 # Clean up temp file

     CLAP_AVAILABLE = True
     CLAP_METHOD = "transformers"
 except ImportError as e1:
+    CLAP_AVAILABLE = False
+    CLAP_METHOD = None
 import torch
 from PIL import Image
 import requests
     def _load_clap_model(self):
         """Load CLAP model on demand"""
         if not CLAP_AVAILABLE:  # False when the guarded import above hit ImportError
+            raise RuntimeError("CLAP model not available - transformers version may not support CLAP")
         if self.clap_model is None:  # load only on first use; later calls skip this branch
             logger.info(f"Loading CLAP model on demand using {CLAP_METHOD} method...")
             try:
                 if CLAP_METHOD == "transformers":
+                    logger.info("Loading CLAP model from HuggingFace...")
                     self.clap_model = ClapModel.from_pretrained(
                         "laion/clap-htsat-unfused",
                         cache_dir=cache_dir,
                         local_files_only=False
                     ).to(self.device)
+                    logger.info("Loading CLAP processor...")
                     self.clap_processor = ClapProcessor.from_pretrained(
                         "laion/clap-htsat-unfused",
                         cache_dir=cache_dir,
                         local_files_only=False
                     )
                 logger.info(f"CLAP model loaded successfully on {self.device} using {CLAP_METHOD}")
             except Exception as e:  # any load failure is logged, then re-raised as RuntimeError
                 logger.error(f"Failed to load CLAP model: {str(e)}")
+                logger.error(f"Error type: {type(e).__name__}")
                 raise RuntimeError(f"CLAP model loading failed: {str(e)}")
     def is_supported_format(self, image_url: str) -> bool:
                 if len(audio_array) > max_length:
                     audio_array = audio_array[:max_length]
+                # Process with CLAP using transformers method
+                inputs = self.clap_processor(
+                    audios=audio_array,
+                    sampling_rate=48000,
+                    return_tensors="pt"
+                )
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                with torch.no_grad():
+                    audio_features = self.clap_model.get_audio_features(**inputs)
+                    audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
+                return audio_features.cpu().numpy().flatten().tolist()
             finally:
                 # Clean up temp file

requirements-simple.txt CHANGED Viewed

@@ -1,7 +1,9 @@
-torch>=2.0.0
-transformers>=4.30.0
-Pillow>=9.0.0
-requests>=2.28.0
-fastapi>=0.104.0
-uvicorn[standard]>=0.22.0
-python-multipart>=0.0.6

+torch>=2.1.0
+transformers==4.30.0
+Pillow==9.5.0
+requests==2.31.0
+fastapi==0.104.1
+uvicorn==0.22.0
+python-multipart==0.0.6
+pydantic==2.5.0
+numpy<2.0.0

requirements.txt CHANGED Viewed

@@ -1,13 +1,12 @@
-torch==2.0.1
-transformers>=4.35.0
-Pillow==9.5.0
-requests==2.31.0
-fastapi==0.104.1
-uvicorn==0.22.0
-python-multipart==0.0.6
-pydantic==2.5.0
 numpy<2.0.0
 librosa>=0.10.0
 soundfile>=0.12.1
-datasets>=2.14.0
-laion-clap

+torch>=2.1.0
+transformers>=4.40.0
+Pillow>=9.5.0
+requests>=2.31.0
+fastapi>=0.104.1
+uvicorn>=0.22.0
+python-multipart>=0.0.6
+pydantic>=2.5.0
 numpy<2.0.0
 librosa>=0.10.0
 soundfile>=0.12.1
+datasets>=2.14.0