update again with requirements.txt
Files changed:

- Dockerfile +6 -21
- README.md +236 -11
- app.py +80 -0
- classifier.py +174 -0
- inference.py +50 -0
- model.py +63 -0
- requirements.txt +8 -8
- types_io.py +96 -0
Dockerfile
CHANGED

```diff
@@ -1,24 +1,6 @@
 # Use Python 3.13 slim image as base
 FROM python:3.10
 
-# Set working directory
-WORKDIR /app
-
-# Install pip and build tools
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-# install uv
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
-
-# Copy the application into the container
-COPY pyproject.toml uv.lock ./
-COPY src/ ./src/
-
-# Install the application dependencies
-RUN uv sync --frozen --no-cache
-
 # Set up a new user named 'user'
 RUN useradd user
 
@@ -31,9 +13,12 @@ ENV HOME=/home/user \
 # Set the working directory to the user's home directory
 WORKDIR $HOME/app
 
-COPY --chown=user . $HOME/app
+COPY --chown=user ./ $HOME/app
+
+RUN pip install -r requirements.txt
 
 # Set the default command to run the FastAPI app with Uvicorn
-CMD ["
+CMD ["uvicorn", "src.kimi_service.app:app", "--host", "0.0.0.0", "--port", "7860"]
```
README.md
CHANGED
@@ -1,11 +1,236 @@
(The previous README contained only 11 blank lines; they are replaced by the content below.)

<h1>Land Use Classification Using GSV Imagery and Visual Language Models</h1>

<p align="center">
    <img src="https://img.shields.io/github/issues/chansoopark98/Tensorflow-Keras-Realtime-Segmentation">
    <img src="https://img.shields.io/github/forks/chansoopark98/Tensorflow-Keras-Realtime-Segmentation">
    <img src="https://img.shields.io/github/stars/chansoopark98/Tensorflow-Keras-Realtime-Segmentation">
    <img src="https://img.shields.io/github/license/chansoopark98/Tensorflow-Keras-Realtime-Segmentation">
</p>

<br>

<p align="justify">
    <img alt="Python" src="https://img.shields.io/badge/Python-3.13-3776AB.svg?&style=for-the-badge&logo=Python&logoColor=white"/>
    <img src="https://img.shields.io/badge/Docker-0db7ed.svg?&style=for-the-badge&logo=Docker&logoColor=white"/>
    <img src="https://img.shields.io/badge/HUggingface-EB8F00.svg?&style=for-the-badge&logo=Huggingface&logoColor=white"/>
    <img src="https://img.shields.io/badge/JUpyter-5C3EE8.svg?&style=for-the-badge&logo=JUpyter&logoColor=white"/>
    <img src="https://img.shields.io/badge/Numpy-013243.svg?&style=for-the-badge&logo=Numpy&logoColor=white"/>
    <br>
    <br>
</p>

<p align="justify">
This repository contains a visual language model that automates land-use classification in Bogotá using Google Street View imagery. The model relies on an advanced visual language architecture (Kimi-VL) that not only classifies images into different land-use categories but also provides detailed reasoning for its decision.
</p>

# Contents

1. Requirements
2. Installation
3. Usage
    - 3.1. Model Evaluation
    - 3.2. Model Deployment
4. Dataset
5. Model Architecture
6. Azure ML Deployment
7. API Usage

## Project Structure

```
├── data/                     # only available locally for legal reasons
│   └── minisample_v1/
│       ├── bodegas/
│       ├── comerciales1-5/
│       ├── dotacionales1/
│       ├── mixto1-3/
│       ├── otros/
│       └── residenciales/
├── src/
│   ├── data/                 # Data loading utilities
│   ├── model/                # Model training code
│   ├── deploy_to_azure.py    # Azure ML deployment script
│   └── score.py              # Model inference script
├── pyproject.toml            # Project dependencies
└── README.md                 # This file
```

## 1. Requirements

- Python >= 3.13
- Dependencies specified in pyproject.toml:
    - PyTorch
    - Transformers
    - Azure ML SDK
    - Other data-processing libraries

## 2. Installation

1. Clone the repository:
```bash
git clone <repository-url>
cd bogota-land-use
```

2. Install the dependencies using uv:
```bash
uv venv
source .venv/bin/activate
uv pip install -e .
```

## 3. Usage

### 3.1 Model Evaluation

To evaluate images with the model:

```bash
python src/model/train.py --image_paths path/to/image1.jpg path/to/image2.jpg
```

The script will:
1. Load the specified images
2. Use the Kimi-VL model to analyze each image
3. Produce a detailed classification that includes:
    - The assigned category
    - A confidence level (High/Medium/Low)
    - Step-by-step reasoning
    - Relevant alternative categories
4. Save the results in JSON format

### 3.2 Model Deployment

To deploy the trained model to Azure ML:

1. Configure the Azure credentials:

```bash
export AZURE_SUBSCRIPTION_ID="your-subscription-id"
export AZURE_RESOURCE_GROUP="your-resource-group"
export AZURE_ML_WORKSPACE="your-workspace-name"
```

2. Deploy the model:
```bash
python src/deploy_to_azure.py
```

## 4. Dataset and Categories

The model classifies images into 20 detailed categories:

1. **RESIDENCIALES**: Dwellings, houses, PH buildings, and condominiums
2. **COMERCIALES 1**: Commercial premises and retail establishments
3. **COMERCIALES 2**: Offices and business services
4. **COMERCIALES 3**: Artisanal activities and local-scale transformation
5. **COMERCIALES 4**: Hotels, motels, and restaurants
6. **COMERCIALES 5**: Operational offices and warehouses
7. **CENTROS COMERCIALES**: Commercial complexes and malls
8. **BODEGAS**: Storage and industrial facilities
9. **PARQUEADEROS**: Parking facilities
10. **DOTACIONALES 1**: Community services
11. **DOTACIONALES 2**: Educational institutions
12. **DOTACIONALES 3**: Health centers and hospitals
13. **DOTACIONALES 4**: Religious buildings
14. **DOTACIONALES 5**: Recreational and cultural facilities
15. **ESPECIALES**: Military areas, cemeteries, airports
16. **MOLES**: Large buildings under construction
17. **RURALES**: Rural and agricultural structures
18. **MIXTO 1**: Residential combined with type-1 commercial
19. **MIXTO 2**: Residential combined with offices
20. **MIXTO 3**: Commercial combined with offices

Each category is clearly defined with specific criteria that the model uses for its analysis and classification.

## 5. Model Architecture

The project uses Moonshot AI's Kimi-VL (Visual Language) model, an advanced visual language architecture that combines:
- High-quality image processing
- Detailed understanding of visual context
- Step-by-step reasoning capability
- Natural-language explanation generation

The model:
- Receives street-level images as input
- Analyzes architectural and contextual features
- Provides a reasoning-based classification
- Includes confidence levels and alternative categories
- Generates detailed explanations of its decision

## 6. Azure ML Deployment

The model is deployed as a managed online endpoint in Azure ML. The deployment process (a sketch using the Azure ML SDK follows this list):

1. **Model Registration**:
    - The Kimi-VL model is registered in the Azure ML workspace
    - All files required for inference are included

2. **Endpoint Configuration**:
    - An online endpoint with key-based authentication is created
    - The execution environment is configured with the required dependencies

3. **Model Deployment**:
    - A Standard_DS3_v2 instance serves the model
    - The model processes images in batches for greater efficiency
    - Automatic error handling and logging are included

4. **Service Features**:
    - REST API for real-time inference
    - Image processing in multiple formats
    - Detailed responses with reasoning
    - Automatic scaling based on demand
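The `src/deploy_to_azure.py` script referenced above is not part of this commit, so the following is only a minimal sketch of the deployment flow described in the list, using the Azure ML Python SDK v2 (`azure-ai-ml`). The endpoint name, deployment name, model artifact path, environment image, and conda file are illustrative assumptions, not values taken from the repository.

```python
# Hypothetical sketch of the Azure ML deployment steps above (azure-ai-ml SDK v2).
# Names, the model artifact path, and the environment definition are placeholders.
import os

from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
    CodeConfiguration,
    Environment,
    ManagedOnlineDeployment,
    ManagedOnlineEndpoint,
    Model,
)
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
    resource_group_name=os.environ["AZURE_RESOURCE_GROUP"],
    workspace_name=os.environ["AZURE_ML_WORKSPACE"],
)

# 1. Register the model files needed for inference (placeholder local path).
model = ml_client.models.create_or_update(
    Model(name="kimi-vl-land-use", path="./model_artifacts")
)

# 2. Create an online endpoint with key-based authentication.
endpoint = ManagedOnlineEndpoint(name="land-use-endpoint", auth_mode="key")
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# 3. Deploy on a Standard_DS3_v2 instance, pointing at the scoring script.
deployment = ManagedOnlineDeployment(
    name="default",
    endpoint_name=endpoint.name,
    model=model,
    environment=Environment(
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
        conda_file="environment.yml",  # placeholder dependency file
    ),
    code_configuration=CodeConfiguration(code="src", scoring_script="score.py"),
    instance_type="Standard_DS3_v2",
    instance_count=1,
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
```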
## 7. API Usage

Once deployed, you can send HTTP POST requests to the endpoint with images:

```python
import requests
import base64
import json

def encode_image(image_path):
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode()

# Prepare the request
api_key = "YOUR_ENDPOINT_KEY"  # key-based authentication credential for the endpoint
image_data = encode_image("path/to/image.jpg")
data = {"image": image_data}

# Send request to endpoint
response = requests.post(
    "YOUR_ENDPOINT_URL",
    json=data,
    headers={"Authorization": f"Bearer {api_key}"}
)

# Get prediction
result = response.json()
print(f"Predicted category: {result['classification']}")
print(f"Confidence level: {result['confidence']}")
print("\nReasoning:")
for step, explanation in result['reasoning'].items():
    print(f"{step}: {explanation}")

# If there are alternative categories
if 'alternative_categories' in result:
    print("\nRelevant alternative categories:")
    for category in result['alternative_categories']:
        print(f"- {category}")
```

The response includes:
- The assigned category
- A confidence level (High/Medium/Low)
- Step-by-step reasoning
- Relevant alternative categories

## License

[Add your license information here]

## Contributors

[Add contributor information here]
app.py
ADDED
@@ -0,0 +1,80 @@

```python
import io
from PIL import Image
import base64

from fastapi import FastAPI, APIRouter, HTTPException
from inference import Inference
import uvicorn
import logging
from typing import Optional

from types_io import ClassificationRequest, ImageData


logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
log = logging.getLogger(__name__)


def decode_base64_image(base64_str: str) -> Optional[Image.Image]:
    """
    Decode a base64 encoded string into a PIL Image object.

    Args:
        base64_str (str): Base64 encoded image string

    Returns:
        Optional[Image.Image]: PIL Image object if successful, None if decoding fails

    Raises:
        Exception: Logged and caught internally, returns None on any error
    """
    try:
        image_data = base64.b64decode(base64_str)
        image = Image.open(io.BytesIO(image_data))
        return image
    except Exception as e:
        log.error(f"Error processing image: {str(e)}")
        return None


app = FastAPI(title="Kimi Service", version="1.5.0")
inference = Inference()
router = APIRouter()


@app.get("/")
async def home():
    return {"message": "Welcome to Kimi Service!"}


@router.post("/classify", response_model=ImageData)
async def classify(request: ClassificationRequest):
    try:
        log.info(f"Processing {len(request.images)} images")
        # Decode images from base64 or load from file paths
        images = []
        for img_str in request.images:
            img = decode_base64_image(img_str)
            images.append(img)
        log.info(f"Decoded {len(images)} images successfully")

        res = inference.classify_building(images)
        if res is None:
            raise HTTPException(status_code=500, detail="Classification failed")
        return res

    except ValueError as ve:
        log.error(f"Validation error: {str(ve)}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        log.error(f"Error during classification: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


app.include_router(router)

if __name__ == "__main__":
    uvicorn.run("app:app", reload=True, port=7860, host="0.0.0.0")
```
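For local testing, the `/classify` route above can be exercised with a small client that sends base64-encoded images, matching the `ClassificationRequest` schema. This is a sketch under the assumption that the service is running locally on port 7860 and that the response follows the declared `response_model=ImageData`; the image path is a placeholder.

```python
# Minimal sketch of a client for the /classify route in app.py.
# Assumes the API runs locally on port 7860; the image path is a placeholder.
import base64

import requests


def encode_image(path: str) -> str:
    # Read an image file and return it as a base64 string, as expected by
    # ClassificationRequest.images in types_io.py.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()


payload = {"images": [encode_image("path/to/street_view.jpg")]}
resp = requests.post("http://localhost:7860/classify", json=payload)
resp.raise_for_status()

# Fields follow the ImageData response model: a list of tags plus a summary.
result = resp.json()
for tag in result["classification"]:
    print(tag["category"], tag["confidence"])
print(result["think"])
```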
classifier.py
ADDED
@@ -0,0 +1,174 @@

```python
from typing import List
import logging
import outlines
from model import Model
from PIL import Image
from types_io import ImageData

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)


LAND_USE_PROMPT = f"""
Task: You are a structured image analysis agent. Given an image of a building front, generate a comprehensive tag list and provide the thinking process for an image classification system.

Requirement: The output should contain a minimum of 3 categories for each input.

Confidence: Confidence score for each category, ranging from 0 (not confident) to 1 (very confident).

Categories:

- Residenciales: Buildings intended for housing - Houses, PH Buildings, Condominiums.
- Comerciales1: Refers to the storage, distribution, or exchange of products, goods, or services with a commercial interest.
- Comerciales2: Buildings where activities aimed at providing services are carried out.
- Comerciales3: Buildings used for artisanal activities where raw materials are transformed on a local scale.
- Comerciales4: Hotels, Motels, and Restaurants.
- Comerciales5: Operational offices and warehouses.
- Centros_Comerciales: Commercial premises located on properties of one or several buildings.
- Bodegas: Buildings in warehouse-type constructions dedicated to commercial, industrial, or storage activities.
- Parqueaderos: Buildings designed for vehicle parking.
- Dotacionales1: Buildings where activities aimed at the welfare or service of a community are carried out.
- Dotacionales2: Buildings designed to carry out educational or training activities.
- Dotacionales3: Buildings with the necessary infrastructure to provide surgical and/or hospitalization services.
- Dotacionales4: Buildings for religious worship owned by communities or religious congregations.
- Dotacionales5: Theaters, cinemas, swimming pools, museums, sports, events, or shows.
- Especiales: Military administrative areas, cemeteries, airport runways.
- Moles: Large buildings in height (>4 floors) or area (>10,000 m²), usually under construction.
- Rurales: Sheds, kiosks, shelters, barns, stables, silos, etc.
- Mixto1: (Residencial + Comercial1) Housing and commercial premises.
- Mixto2: (Residencial + Comercial2) Housing and offices.
- Mixto3: (Comercial1 + Comercial2) Commercial premises and offices.


Return the information in the following JSON schema:
{ImageData.model_json_schema()}
"""


class Classifier:
    def __init__(self, MAX_NEW_TOKENS: int = 1024):
        self.max_new_tokens = MAX_NEW_TOKENS

        logger.info("Initializing Classifier")
        logger.info("Loading model...")
        self.model = Model.load_model()
        logger.info("Loading processor...")
        self.processor = Model.load_processor()
        logger.info("Classifier initialization complete")
        logger.info("Setting up image data generator...")
        self.image_data_generator = outlines.Generator(self.model, ImageData)

    def get_response(self, images: List[Image.Image]) -> dict:
        logger.info(f"Processing classification request for {len(images)} images")
        logger.info("Loading and preprocessing images...")
        images = self.get_input_tensor(images)
        logger.debug("Successfully preprocessed images")

        logger.info("Preparing input messages...")
        messages = self.prepare_messages(images, LAND_USE_PROMPT)

        response = self.generate_model_response(images, messages)

        return {"output": response}

    def get_input_tensor(self, images: List[Image.Image]) -> List[Image.Image]:
        """
        Preprocess a list of PIL images.
        Args:
            images (List[Image.Image]): List of PIL images to be processed.
        Returns:
            List[Image.Image]: List of preprocessed images ready for classification.
        """
        if not images:
            raise ValueError("No images provided for classification.")

        logger.info(f"Preprocessing {len(images)} images...")
        processed_images = []
        for idx, img in enumerate(images):
            logger.debug(f"Processing image at index: {idx}")
            try:
                img = self.resize_image(img)
                processed_images.append(img)
                logger.debug(f"Successfully processed image at index: {idx}")
            except Exception as e:
                logger.error(f"Error processing image at index {idx}: {str(e)}")
                raise
        return processed_images

    def generate_model_response(self, images: List[Image.Image], messages: List[dict]) -> str:
        """
        Generate response from the model.
        Args:
            images (List[Image.Image]): List of preprocessed images.
            messages (List[dict]): Messages for the processor.
        Returns:
            str: Decoded response from the model.
        """
        logger.info("Applying chat template...")
        text = self.processor.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        )
        logger.info("Generating response with outlines...")
        result = self.image_data_generator(
            {
                "text": text,
                "images": images
            },
            max_new_tokens=self.max_new_tokens
        )
        logger.debug("Successfully generated response")

        return result

    @staticmethod
    def resize_image(image: Image.Image, max_size: int = 224) -> Image.Image:
        """
        Resize an image while maintaining aspect ratio.

        Args:
            image: PIL Image object to resize
            max_size: Maximum dimension (width or height) of the output image

        Returns:
            PIL Image: Resized image with maintained aspect ratio
        """
        # Get current dimensions
        width, height = image.size

        # Calculate scaling factor to fit within max_size
        scale = min(max_size / width, max_size / height)

        # Only resize if image is larger than max_size
        if scale < 1:
            new_width = int(width * scale)
            new_height = int(height * scale)
            image = image.resize(
                (new_width, new_height),
                Image.Resampling.LANCZOS
            )

        return image

    @staticmethod
    def prepare_messages(images: List[Image.Image], classification_prompt: str) -> List[dict]:
        """
        Prepare messages for the processor.
        Args:
            images (List[Image.Image]): List of PIL images.
            classification_prompt (str): The prompt for classification.
        Returns:
            List[dict]: Messages for the processor.
        """
        return [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image} for image in images
                ] + [{"type": "text", "text": classification_prompt}],
            },
        ]
```
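`Classifier` is normally driven through `Inference` (see `inference.py` below), but a direct call illustrates the flow: images are resized, combined with `LAND_USE_PROMPT` into chat messages, and passed to the outlines generator constrained by `ImageData`. A minimal sketch, assuming the Kimi-VL weights can be loaded on the available hardware; the image file names are placeholders.

```python
# Hypothetical direct use of Classifier from classifier.py.
# Image paths are placeholders; loading Kimi-VL-A3B-Thinking requires the
# weights to be available (realistically on a GPU).
from PIL import Image

from classifier import Classifier

clf = Classifier(MAX_NEW_TOKENS=1024)

# A couple of street-level photos of the same property.
images = [Image.open(p).convert("RGB") for p in ["front.jpg", "side.jpg"]]

# get_response resizes the images, builds the chat messages with
# LAND_USE_PROMPT, and asks the outlines generator for an ImageData result.
result = clf.get_response(images)
print(result["output"])
```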
inference.py
ADDED
@@ -0,0 +1,50 @@

```python
from classifier import Classifier
from typing import List
from PIL import Image
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

classifier = Classifier()


class Inference():
    def __init__(self):
        self.classifier = classifier

    def _prepare_images(self, images: List[Image.Image]) -> List[Image.Image]:
        """
        Prepare PIL images for classification by converting to RGB
        Args:
            images: List of PIL Image objects
        Returns:
            List of RGB PIL Image objects
        """
        prepared_images = []
        for idx, image in enumerate(images):
            try:
                # Convert to RGB to ensure compatibility
                image = image.convert('RGB')
                prepared_images.append(image)
            except Exception as e:
                raise ValueError(f"Error processing image {idx}: {str(e)}")

        return prepared_images

    def classify_building(self, images: List[Image.Image]) -> dict:
        """
        Classify building type from a list of PIL Image objects
        Args:
            images: List of PIL Image objects
        Returns:
            Classification response
        """
        logger.info(f"Preparing {len(images)} images for classification")
        prepared_images = self._prepare_images(images)
        logger.info("Image preparation successful")
        response = self.classifier.get_response(prepared_images)
        return response
```
model.py
ADDED
@@ -0,0 +1,63 @@

```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
import outlines
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)


class Model:
    """
    Handles loading and managing the Kimi-VL-A3B-Thinking model and processor.
    Loads model and processor once and provides accessors.
    """

    # ALL MODEL CONFIGURATIONS
    _model = None
    _processor = None
    MODEL_PATH = "moonshotai/Kimi-VL-A3B-Thinking-2506"
    model_class = AutoModelForCausalLM
    processor_class = AutoProcessor

    model_kwargs = {
        "device_map": "auto",
        "torch_dtype": "auto",
        "trust_remote_code": True
    }
    processor_kwargs = {
        "device_map": "auto",
        "trust_remote_code": True
    }

    @classmethod
    def load(cls):
        if cls._model is None:
            try:
                tf_model = cls.model_class.from_pretrained(
                    cls.MODEL_PATH, **cls.model_kwargs
                )
                tf_processor = cls.processor_class.from_pretrained(
                    cls.MODEL_PATH, **cls.processor_kwargs
                )
            except Exception as e:
                logger.error(f"Failed to load model or processor: {str(e)}")
                raise

            cls._model = outlines.from_transformers(tf_model, tf_processor)
            cls._processor = tf_processor

        return cls._model, cls._processor

    @classmethod
    def load_model(cls):
        model, _ = cls.load()
        return model

    @classmethod
    def load_processor(cls):
        _, processor = cls.load()
        return processor
```
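Because `Model.load()` memoizes the wrapped model and processor in class attributes, `load_model()` and `load_processor()` can be called repeatedly (as `Classifier.__init__` does) without reloading the weights. A small sketch of that behavior; the first call still has to download or load the Kimi-VL checkpoint.

```python
# Sketch of the memoized loading behavior of Model in model.py.
# The first call loads Kimi-VL once; later calls reuse the cached objects.
from model import Model

model = Model.load_model()          # triggers Model.load() on first use
processor = Model.load_processor()  # reuses the already-loaded pair

assert model is Model.load_model()          # same outlines-wrapped model
assert processor is Model.load_processor()  # same processor instance
```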
requirements.txt
CHANGED

```diff
@@ -1,14 +1,14 @@
-fastapi>=0.115.12
-requests
-uvicorn>=0.34.3
-sentencepiece
-torch>=2.7.0
-torchvision>=0.22.0
 datasets>=3.6.0
 pillow>=11.2.1
+scikit-learn>=1.6.1
+torch>=2.7.0
+torchaudio>=2.7.0
+torchvision>=0.22.0
 transformers>=4.52.3
 accelerate>=1.7.0
 tiktoken>=0.9.0
 blobfile>=3.0.0
+fastapi>=0.115.12
+uvicorn[standard]>=0.34.3
+outlines>=1.0.2
+requests>=2.32.3
```
types_io.py
ADDED
@@ -0,0 +1,96 @@

```python
from enum import Enum
from pydantic import BaseModel, Field, confloat
from pydantic.types import StringConstraints
from typing import List
from typing_extensions import Annotated


class ClassificationRequest(BaseModel):
    """
    Request model for image classification.
    Contains a list of base64-encoded images or file paths.
    """
    images: List[str] = Field(
        ...,
        example=[
            "<base64-image-1>",
            "<base64-image-2>"
        ]
    )


class TagType(Enum):
    """
    Enum for all possible building tag categories.
    """
    RESIDENCIALES = "Residenciales"
    COMERCIALES1 = "Comerciales1"
    COMERCIALES2 = "Comerciales2"
    COMERCIALES3 = "Comerciales3"
    COMERCIALES4 = "Comerciales4"
    COMERCIALES5 = "Comerciales5"
    CENTROS_COMERCIALES = "Centros_Comerciales"
    BODEGAS = "Bodegas"
    PARQUEADEROS = "Parqueaderos"
    DOTACIONALES1 = "Dotacionales1"
    DOTACIONALES2 = "Dotacionales2"
    DOTACIONALES3 = "Dotacionales3"
    DOTACIONALES4 = "Dotacionales4"
    DOTACIONALES5 = "Dotacionales5"
    ESPECIALES = "Especiales"
    MOLES = "Moles"
    RURALES = "Rurales"
    MIXTO1 = "Mixto1"
    MIXTO2 = "Mixto2"
    MIXTO3 = "Mixto3"


class ImageTag(BaseModel):
    """
    Represents a single classification tag and its confidence score.
    """
    category: TagType = Field(
        ...,
        example=TagType.RESIDENCIALES
    )
    confidence: Annotated[
        confloat(ge=0.0, le=1.0),
        Field(
            description=(
                "Confidence score for the tag, between 0 (not confident) "
                "and 1 (very confident)."
            ),
            example=0.92
        )
    ]


class ImageData(BaseModel):
    """
    Response model for image classification results.
    Contains a list of classification tags and a summary string.
    """
    classification: List[ImageTag] = Field(
        ...,
        min_items=3,
        max_items=5,
        description="List of classification tags for the image.",
        example=[
            {"category": "Residenciales", "confidence": 0.92},
            {"category": "Comerciales1", "confidence": 0.65},
            {"category": "Moles", "confidence": 0.33}
        ]
    )
    think: Annotated[
        str,
        StringConstraints(min_length=100, max_length=2048),
        Field(
            description="A summary of the </think> section of the output.",
            example=(
                "This building is primarily residential with some commercial "
                "activity on the ground floor. The structure and facade "
                "suggest a mixed-use property typical of urban environments."
            )
        )
    ]
```
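Since `ImageData.model_json_schema()` is interpolated into `LAND_USE_PROMPT` and the same model constrains the outlines generator, it can be useful to sanity-check a hand-written payload against the schema. A minimal sketch; the sample values mirror the `example` fields declared above.

```python
# Validate a sample payload against the ImageData schema from types_io.py.
# The values below mirror the example fields declared on the models.
from types_io import ImageData

sample = {
    "classification": [
        {"category": "Residenciales", "confidence": 0.92},
        {"category": "Comerciales1", "confidence": 0.65},
        {"category": "Moles", "confidence": 0.33},
    ],
    "think": (
        "This building is primarily residential with some commercial "
        "activity on the ground floor. The structure and facade suggest "
        "a mixed-use property typical of urban environments."
    ),
}

data = ImageData.model_validate(sample)  # raises ValidationError if the payload is invalid
print(data.classification[0].category)   # TagType.RESIDENCIALES
```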