"""Convert uploaded files (text/code, ZIP, images, PDF) into plain text."""

import io
import json
import zipfile
from pathlib import Path
from typing import List, Optional, Set, Tuple

from config import settings

# Optional feature flags: set to True only when the optional dependency
# can actually be imported.
HAS_OCR = False
HAS_PDF = False

try:
    from PIL import Image  # noqa
    import pytesseract  # noqa
    HAS_OCR = True
except ImportError:
    HAS_OCR = False

try:
    import PyPDF2  # noqa
    HAS_PDF = True
except ImportError:
    HAS_PDF = False

# Maps a lowercase file extension to the language label used in headers.
CODE_EXTS = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".java": "java",
    ".cs": "csharp",
    ".php": "php",
    ".rb": "ruby",
    ".go": "go",
    ".rs": "rust",
    ".c": "c",
    ".cpp": "cpp",
    ".h": "c",
    ".css": "css",
    ".html": "html",
    ".htm": "html",
    ".sql": "sql",
    ".sh": "bash",
    ".bash": "bash",
    ".yml": "yaml",
    ".yaml": "yaml",
    ".json": "json",
    ".xml": "xml",
    ".md": "markdown",
}

# Extensions accepted inside a ZIP in addition to the CODE_EXTS keys.
_PLAIN_TEXT_EXTS = (".txt", ".md")

# Extensions routed to OCR by read_uploaded_files.
_IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".webp", ".bmp")


def guess_lang_from_name(name: str) -> Optional[str]:
    """Return the language label for *name*'s extension, or None if unknown."""
    return CODE_EXTS.get(Path(name).suffix.lower())


def guess_lang_from_content(content: str) -> Optional[str]:
    """Guess a language label from simple substring heuristics.

    Checks run in a fixed order and the first match wins. Returns None when
    *content* is not a string or when no heuristic matches.
    """
    if not isinstance(content, str):
        return None

    low = content.lower()
    if "def " in low or "import " in low:
        return "python"
    # Java markers are case-sensitive, so they are matched on the raw text.
    if "public class" in content or "System.out.println" in content:
        return "java"
    if "select " in low or "create table" in low:
        return "sql"
    if "function " in low and "console.log" in low:
        return "javascript"
    # NOTE(review): the original condition here was lost when the file was
    # extracted (everything after the opening quote was stripped like an HTML
    # tag). Reconstructed as an HTML check -- TODO confirm against the
    # original source.
    if "<html" in low:
        return "html"
    return None


def truncate_text(txt: str) -> str:
    """Cut *txt* to settings.MAX_CHARS_PER_FILE characters.

    Text within the limit is returned unchanged; longer text is cut and a
    marker line is appended.
    """
    max_chars = settings.MAX_CHARS_PER_FILE
    if len(txt) <= max_chars:
        return txt
    return txt[:max_chars] + "\n[... archivo recortado ...]"


def read_image_to_text(raw: bytes) -> str:
    """Run OCR on image bytes and return the extracted text.

    Returns a bracketed placeholder message instead of raising when OCR is
    unavailable, when no text is found, or when any error occurs.
    """
    if not HAS_OCR:
        return "[Funcionalidad OCR no disponible. Instala 'pytesseract' y 'tesseract-ocr']"
    try:
        from PIL import Image
        import pytesseract

        img = Image.open(io.BytesIO(raw))
        text = pytesseract.image_to_string(img)
        return text.strip() or "[Imagen sin texto extraíble]"
    except Exception as e:
        return f"[Error OCR: {e}]"


def read_pdf_to_text(raw: bytes) -> str:
    """Extract the text of every page of a PDF given as bytes.

    Page texts are joined with newlines. Returns a bracketed placeholder
    message instead of raising when PyPDF2 is unavailable, when the PDF has
    no extractable text, or when any error occurs.
    """
    if not HAS_PDF:
        return "[Funcionalidad PDF no disponible. Instala 'PyPDF2']"
    try:
        import PyPDF2

        reader = PyPDF2.PdfReader(io.BytesIO(raw))
        # extract_text() may yield a falsy value; normalize it to "".
        texts = [p.extract_text() or "" for p in reader.pages]
        result = "\n".join(texts).strip()
        return result or "[PDF sin texto extraíble]"
    except Exception as e:
        return f"[Error PDF: {e}]"


def read_zip(raw: bytes, zip_name: str) -> str:
    """Return the text of the code/text members of a ZIP archive.

    Only members whose extension is a CODE_EXTS key or one of
    _PLAIN_TEXT_EXTS are read. Each one is decoded as UTF-8 (invalid bytes
    replaced), truncated with truncate_text and emitted under a
    "--- name (lang) ---" header.

    Args:
        raw: The archive contents.
        zip_name: Archive name, used only in messages.

    Returns:
        The concatenated member sections, or a bracketed message when the
        archive is corrupt, unreadable, or contains no accepted members.
    """
    collected: List[str] = []
    try:
        with zipfile.ZipFile(io.BytesIO(raw)) as zf:
            for info in zf.infolist():
                if info.is_dir():
                    continue
                inner_name = info.filename
                ext = Path(inner_name).suffix.lower()
                # Filter BEFORE reading so irrelevant members are never
                # decompressed and cannot abort the whole archive.
                if ext not in CODE_EXTS and ext not in _PLAIN_TEXT_EXTS:
                    continue
                # The archive is untrusted input: apply the same per-file
                # limit used for direct uploads to the declared uncompressed
                # size, so a small ZIP cannot inflate into a huge buffer.
                if info.file_size > settings.MAX_FILE_SIZE:
                    collected.append(
                        f"--- {inner_name} ---\n[Archivo muy grande, ignorado]\n"
                    )
                    continue
                # Read by ZipInfo so duplicate names each yield their own data.
                text = zf.read(info).decode("utf-8", errors="replace")
                # Detect the language on the full text, then truncate.
                lang = (
                    guess_lang_from_name(inner_name)
                    or guess_lang_from_content(text)
                    or "text"
                )
                text = truncate_text(text)
                collected.append(f"--- {inner_name} ({lang}) ---\n{text}\n")
    except zipfile.BadZipFile:
        return f"[Error leyendo ZIP: archivo corrupto ({zip_name})]"
    except Exception as e:
        return f"[Error leyendo ZIP: {e}]"

    return "\n".join(collected) if collected else f"[ZIP {zip_name} sin archivos útiles]"


def read_uploaded_files(files, exclude_text: str) -> Tuple[str, str, str]:
    """Read uploaded file objects and build a combined text dump.

    Args:
        files: Iterable of objects exposing ``read()`` and, optionally,
            ``name`` and ``seek()``.
        exclude_text: Newline-separated file names (full or base name) to
            skip. None or an empty string excludes nothing.

    Returns:
        A ``(combined_text, preview, first_code)`` tuple: the concatenated
        per-file sections, a one-line-per-file summary, and the content of
        the first non-empty ZIP or first file whose language is not "text"
        ("" if there is none).
    """
    if not files:
        return "", "Sin archivos", ""

    # `or ""` keeps a missing exclusion list (None) from crashing splitlines().
    exclude: Set[str] = {
        line.strip() for line in (exclude_text or "").splitlines() if line.strip()
    }
    parts: List[str] = []
    preview: List[str] = []
    total_size = 0
    first_code = ""

    for f in files:
        name = getattr(f, "name", "archivo")
        basename = Path(name).name
        if basename in exclude or name in exclude:
            preview.append(f"🚫 {basename} (excluido)")
            continue

        # Best effort: rewind in case the handle was already consumed; not
        # every file-like object supports seek().
        try:
            f.seek(0)
        except Exception:
            pass

        raw = f.read()
        file_size = len(raw)
        total_size += file_size
        # The cumulative limit is checked first: once exceeded, this file and
        # all remaining ones are dropped.
        if total_size > settings.MAX_TOTAL_UPLOAD:
            preview.append("⚠️ Límite total de carga superado, se ignoró el resto.")
            break
        if file_size > settings.MAX_FILE_SIZE:
            parts.append(f"# {basename}\n[Archivo muy grande, ignorado]\n")
            preview.append(f"⚠️ {basename} (muy grande)")
            continue

        suffix = Path(basename).suffix.lower()
        if suffix == ".zip":
            content = read_zip(raw, basename)
            parts.append(f"# {basename} (zip)\n{content}\n")
            preview.append(f"📦 {basename}")
            if not first_code and content.strip():
                first_code = content[:settings.MAX_CHARS_PER_FILE]
        elif suffix in _IMAGE_EXTS:
            content = read_image_to_text(raw)
            parts.append(f"# {basename} (imagen)\n{content}\n")
            preview.append(f"🖼️ {basename}")
        elif suffix == ".pdf":
            content = read_pdf_to_text(raw)
            parts.append(f"# {basename} (pdf)\n{content}\n")
            preview.append(f"📄 {basename}")
        else:
            # errors="replace" never raises for bytes; the except covers
            # handles whose read() did not return bytes.
            try:
                text = raw.decode("utf-8", errors="replace")
            except Exception:
                text = "[No decodificable]"
            text = truncate_text(text)
            lang = (
                guess_lang_from_name(basename)
                or guess_lang_from_content(text)
                or "text"
            )
            parts.append(f"# {basename} ({lang})\n{text}\n")
            preview.append(f"📝 {basename} ({lang})")
            if not first_code and lang != "text":
                first_code = text

    return (
        "\n".join(parts),
        "\n".join(preview) if preview else "Sin archivos válidos",
        first_code,
    )