import json import hashlib import os import platform import shutil import subprocess import tarfile import tempfile import time from collections import OrderedDict from pathlib import Path from typing import Dict, Optional, List, Tuple, Generator, Any import requests from config import settings class LRUCache: """Cache LRU simple y thread-safe para respuestas de Ollama.""" def __init__(self, max_size: int = 128): from threading import Lock self.max_size = max_size self._lock = Lock() self._data: "OrderedDict[str, str]" = OrderedDict() def get(self, key: str) -> Optional[str]: with self._lock: if key in self._data: self._data.move_to_end(key) return self._data[key] return None def set(self, key: str, value: str) -> None: with self._lock: self._data[key] = value self._data.move_to_end(key) if len(self._data) > self.max_size: self._data.popitem(last=False) response_cache = LRUCache(settings.CACHE_MAX_ITEMS) EMBEDDED_OLLAMA_DIR = Path.home() / ".local" / "ollama-lite" EMBEDDED_BIN = EMBEDDED_OLLAMA_DIR / "bin" / "ollama" EMBEDDED_LIB_DIR = EMBEDDED_OLLAMA_DIR / "lib" / "ollama" _OLLAMA_BIN_CACHE: Optional[str] = None def _ollama_url() -> str: return settings.OLLAMA_URL def _arch_slug() -> Optional[str]: machine = platform.machine().lower() if machine in ("x86_64", "amd64"): return "amd64" if machine in ("arm64", "aarch64"): return "arm64" return None def _ensure_embedded_ollama() -> Optional[str]: """ Descarga un binario portátil de Ollama si no existe y devuelve la ruta al ejecutable. """ global _OLLAMA_BIN_CACHE if _OLLAMA_BIN_CACHE: return _OLLAMA_BIN_CACHE existing = shutil.which("ollama") if existing: _OLLAMA_BIN_CACHE = existing return existing if EMBEDDED_BIN.exists(): EMBEDDED_BIN.chmod(0o755) _OLLAMA_BIN_CACHE = str(EMBEDDED_BIN) os.environ.setdefault("PATH", "") os.environ["PATH"] = f"{EMBEDDED_BIN.parent}:{os.environ['PATH']}" _inject_ld_library_path() return _OLLAMA_BIN_CACHE arch = _arch_slug() if not arch: return None EMBEDDED_OLLAMA_DIR.mkdir(parents=True, exist_ok=True) bundle_url = f"https://github.com/ollama/ollama/releases/latest/download/ollama-linux-{arch}.tgz" tmp_fd, tmp_path = tempfile.mkstemp(prefix="ollama_bundle_", suffix=".tgz") os.close(tmp_fd) try: print(f"📥 Descargando Ollama portátil ({arch})...") with requests.get(bundle_url, stream=True, timeout=(30, 120)) as resp: resp.raise_for_status() with open(tmp_path, "wb") as bundle: for chunk in resp.iter_content(chunk_size=1024 * 1024): if chunk: bundle.write(chunk) print("📦 Extrayendo Ollama portátil...") with tarfile.open(tmp_path, mode="r:gz") as tar: members = [ m for m in tar.getmembers() if m.name.startswith("bin/") or m.name.startswith("lib/") ] tar.extractall(path=EMBEDDED_OLLAMA_DIR, members=members) # Elimina librerías CUDA para ahorrar espacio en entornos CPU if EMBEDDED_LIB_DIR.exists(): for cuda_dir in EMBEDDED_LIB_DIR.glob("cuda_*"): shutil.rmtree(cuda_dir, ignore_errors=True) EMBEDDED_BIN.chmod(0o755) _OLLAMA_BIN_CACHE = str(EMBEDDED_BIN) os.environ["PATH"] = f"{EMBEDDED_BIN.parent}:{os.environ.get('PATH', '')}" _inject_ld_library_path() return _OLLAMA_BIN_CACHE except Exception as exc: print(f"❌ No se pudo instalar Ollama portátil: {exc}") return None finally: try: os.remove(tmp_path) except OSError: pass def _inject_ld_library_path(): current = os.environ.get("LD_LIBRARY_PATH", "") lib_path = str(EMBEDDED_LIB_DIR) if lib_path not in current.split(":"): prefix = f"{lib_path}:" if current else lib_path os.environ["LD_LIBRARY_PATH"] = f"{prefix}{current}" def _ollama_command() -> Optional[str]: cmd = _ensure_embedded_ollama() return cmd def verify() -> str: try: r = requests.get(f"{_ollama_url()}/api/version", timeout=2) r.raise_for_status() v = r.json().get("version", "?") return f"✅ Ollama en ejecución (v{v})" except requests.exceptions.RequestException: return "❌ Ollama no responde. Inicia Ollama primero." except Exception as e: return f"❌ Error al verificar Ollama: {e}" def ensure_ollama_running() -> bool: try: r = requests.get(f"{_ollama_url()}/api/version", timeout=2) return r.ok except requests.exceptions.RequestException: return False except Exception: return False def start_ollama() -> str: """Intenta iniciar Ollama usando subprocess y polling (hasta 5s).""" if ensure_ollama_running(): return "✅ Ollama ya está en ejecución." ollama_cmd = _ollama_command() if not ollama_cmd: return "❌ No se encontró el binario de Ollama y no se pudo descargar automáticamente." # Intento con systemctl (común en Linux) try: subprocess.run( ["systemctl", "--user", "start", "ollama"], capture_output=True, check=False, text=True ) time.sleep(2) if ensure_ollama_running(): return verify() except FileNotFoundError: pass # systemctl no disponible # Fallback a 'ollama serve' en segundo plano try: env = os.environ.copy() _inject_ld_library_path() env["LD_LIBRARY_PATH"] = os.environ.get("LD_LIBRARY_PATH", "") subprocess.Popen( [ollama_cmd, "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=env, ) for _ in range(60): if ensure_ollama_running(): return verify() time.sleep(1) except FileNotFoundError: return "❌ No se encontró el binario de Ollama. Instálalo primero." except Exception as e: return f"❌ Error al intentar 'ollama serve': {e}" return "❌ No se pudo iniciar Ollama. Por favor, inícialo manualmente." def list_models() -> List[str]: """Lista modelos potentes de HF y los locales del usuario.""" # Lista curada de modelos potentes y recomendados hf_suggestions = [ "hf.co/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated-GGUF", "hf.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M", "hf.co/meta-llama/Meta-Llama-3-8B-Instruct-GGUF", "hf.co/arcee-ai/SuperNova-Medius-GGUF", "hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF", "llama3.1:8b", # Modelo local común "llama3:8b", ] local_models = [] try: r = requests.get(f"{_ollama_url()}/api/tags", timeout=3) if r.ok: models_data = r.json().get("models", []) local_models = [m.get("model") for m in models_data if m.get("model")] except requests.exceptions.RequestException: # Ollama puede no estar corriendo, devolvemos solo las sugerencias pass # Combinar y eliminar duplicados, manteniendo el orden combined_models = [] seen = set() for model in hf_suggestions + local_models: if model not in seen: combined_models.append(model) seen.add(model) return combined_models def _is_hf_model(model_name: str) -> bool: """Detecta si es un modelo de Hugging Face Hub""" return model_name.startswith("hf.co/") or model_name.startswith("huggingface.co/") def _sanitize_model_name(name: str) -> Optional[str]: """Valida nombres de modelos incluyendo formato HF""" import re # Permitir formato hf.co/user/repo y hf.co/user/repo:quant if _is_hf_model(name): pattern = r"^(hf\.co|huggingface\.co)/[\w.-]+/[\w.-]+(:[A-Za-z0-9_.-]+)?$" if re.match(pattern, name): return name return None # Validación normal para modelos locales if re.fullmatch(r"[A-Za-z0-9:._/-]+", name): return name return None def is_model_available(name: str) -> bool: if not ensure_ollama_running(): return False try: r = requests.get(f"{_ollama_url()}/api/tags", timeout=5) r.raise_for_status() models = [m.get("model") for m in r.json().get("models", []) if m.get("model")] return name in models except requests.exceptions.RequestException: return False except Exception: return False def pull_model_with_progress(model_name: str): if not ensure_ollama_running(): yield "❌ Ollama no está corriendo. Inícialo primero." return safe = _sanitize_model_name(model_name) if not safe: yield "⚠️ Nombre de modelo inválido." return if _is_hf_model(safe): yield f"📦 Descargando modelo GGUF desde Hugging Face: {safe}" try: r = requests.post( f"{_ollama_url()}/api/pull", json={"name": safe}, stream=True, timeout=1800, # 30 minutos de timeout para modelos grandes ) r.raise_for_status() for line in r.iter_lines(): if not line: continue try: data = json.loads(line.decode("utf-8")) status = data.get("status", "") if "total" in data and "completed" in data: total = data["total"] completed = data["completed"] if total > 0: pct = int(completed / total * 100) yield f"📥 {status}: {pct}%" else: yield f"📥 {status}" except json.JSONDecodeError: continue yield f"✅ Modelo {safe} descargado correctamente" except requests.exceptions.RequestException as e: yield f"⚠️ Error de red al descargar: {e}" except Exception as e: yield f"⚠️ Error inesperado: {e}" def ask_ollama_stream( model: str, system_prompt: str, history: List[Tuple[Optional[str], Optional[str]]], new_prompt: str, temperature: float, top_p: float, max_tokens: int, ) -> Generator[str, None, None]: """Hace un chat streaming a Ollama, usando contexto multi-turno y caché LRU.""" messages: List[Dict[str, str]] = [{"role": "system", "content": system_prompt}] ctx_n = settings.CONTEXT_HISTORY_TURNS for user_msg, bot_msg in history[-ctx_n:]: if user_msg: messages.append({"role": "user", "content": user_msg}) if bot_msg: messages.append({"role": "assistant", "content": bot_msg}) messages.append({"role": "user", "content": new_prompt}) payload: Dict[str, Any] = { "model": model, "messages": messages, "options": { "temperature": float(temperature), "top_p": float(top_p), "num_predict": int(max_tokens), }, "stream": True, } cache_key = hashlib.md5(json.dumps(payload, sort_keys=True).encode()).hexdigest() cached = response_cache.get(cache_key) if settings.CACHE_RESPONSES and cached: yield cached return accumulated = "" try: with requests.post( f"{_ollama_url()}/api/chat", json=payload, stream=True, timeout=300 ) as r: r.raise_for_status() for line in r.iter_lines(): if not line: continue try: data = json.loads(line.decode("utf-8")) content = data.get("message", {}).get("content", "") if content: accumulated += content yield content except json.JSONDecodeError: continue if settings.CACHE_RESPONSES and accumulated: response_cache.set(cache_key, accumulated) except requests.exceptions.RequestException as e: yield f"\n\n⚠️ Error de red: {e}" except Exception as e: yield f"\n\n⚠️ Error de conexión: {e}"