|
|
import io |
|
|
import json |
|
|
import zipfile |
|
|
from pathlib import Path |
|
|
from typing import Tuple, List, Set |
|
|
|
|
|
from config import settings |
|
|
|
|
|
|
|
|
# Optional OCR support: HAS_OCR is True only when both PIL and pytesseract
# import successfully.
try:
    from PIL import Image
    import pytesseract
except ImportError:
    HAS_OCR = False
else:
    HAS_OCR = True

# Optional PDF support: HAS_PDF is True only when PyPDF2 imports successfully.
try:
    import PyPDF2
except ImportError:
    HAS_PDF = False
else:
    HAS_PDF = True
|
|
|
|
|
# Maps a lower-case file extension (leading dot included) to the language
# label used when describing a file's contents.
CODE_EXTS = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".java": "java",
    ".cs": "csharp",
    ".php": "php",
    ".rb": "ruby",
    ".go": "go",
    ".rs": "rust",
    ".c": "c",
    ".cpp": "cpp",
    ".h": "c",
    ".css": "css",
    ".html": "html",
    ".htm": "html",
    ".sql": "sql",
    ".sh": "bash",
    ".bash": "bash",
    ".yml": "yaml",
    ".yaml": "yaml",
    ".json": "json",
    ".xml": "xml",
    ".md": "markdown",
}
|
|
|
|
|
|
|
|
def guess_lang_from_name(name: str):
    """Return the language label for a file name, based on its extension.

    The extension is lower-cased before being looked up in CODE_EXTS.
    Returns None when the extension is not a key of that mapping (this
    includes names without any extension).
    """
    extension = Path(name).suffix.lower()
    return CODE_EXTS.get(extension)
|
|
|
|
|
|
|
|
def guess_lang_from_content(content: str):
    """Guess a language label from substrings found in ``content``.

    The checks are simple substring heuristics, evaluated in order; the
    first match wins:

    1. ``"public class"`` or ``"System.out.println"`` (case-sensitive) -> "java"
    2. ``"function "`` and ``"console.log"`` both present -> "javascript"
    3. ``"def "`` or ``"import "`` -> "python"
    4. ``"select "`` or ``"create table"`` -> "sql"
    5. ``"<html"`` -> "html"

    Except for the Java markers, matching is done on a lower-cased copy of
    the text.

    Args:
        content: Text to inspect.

    Returns:
        One of the labels above, or None when ``content`` is not a ``str``
        or no heuristic matches.
    """
    if not isinstance(content, str):
        return None

    low = content.lower()

    # Java and JavaScript are tested before Python on purpose: their sources
    # routinely contain "import " statements, so the looser Python check
    # would otherwise claim them and these branches would rarely be reached.
    if "public class" in content or "System.out.println" in content:
        return "java"
    if "function " in low and "console.log" in low:
        return "javascript"
    if "def " in low or "import " in low:
        return "python"
    if "select " in low or "create table" in low:
        return "sql"
    if "<html" in low:
        return "html"
    return None
|
|
|
|
|
|
|
|
def truncate_text(txt: str) -> str:
    """Cap ``txt`` at ``settings.MAX_CHARS_PER_FILE`` characters.

    Text within the limit is returned unchanged. Longer text is cut to the
    limit and a trailing marker line is appended to signal the cut.
    """
    limit = settings.MAX_CHARS_PER_FILE
    if len(txt) > limit:
        return txt[:limit] + "\n[... archivo recortado ...]"
    return txt
|
|
|
|
|
|
|
|
def read_image_to_text(raw: bytes) -> str:
    """Run OCR over image bytes and return the recognised text.

    Never raises: a bracketed placeholder string is returned instead when
    OCR support is unavailable (``HAS_OCR`` is false), when the recognised
    text is empty after stripping, or when any exception occurs while
    opening or processing the image.
    """
    if not HAS_OCR:
        return "[Funcionalidad OCR no disponible. Instala 'pytesseract' y 'tesseract-ocr']"

    try:
        from PIL import Image
        import pytesseract

        image = Image.open(io.BytesIO(raw))
        extracted = pytesseract.image_to_string(image).strip()
    except Exception as exc:
        return f"[Error OCR: {exc}]"

    if extracted:
        return extracted
    return "[Imagen sin texto extraíble]"
|
|
|
|
|
|
|
|
def read_pdf_to_text(raw: bytes) -> str:
    """Extract the text of every page of a PDF given as bytes.

    Page texts are joined with newlines and the result is stripped. Never
    raises: a bracketed placeholder string is returned instead when PDF
    support is unavailable (``HAS_PDF`` is false), when no text could be
    extracted, or when any exception occurs while parsing.
    """
    if not HAS_PDF:
        return "[Funcionalidad PDF no disponible. Instala 'PyPDF2']"

    try:
        import PyPDF2

        reader = PyPDF2.PdfReader(io.BytesIO(raw))
        # A page whose extract_text() yields a falsy value contributes "".
        joined = "\n".join(page.extract_text() or "" for page in reader.pages).strip()
    except Exception as exc:
        return f"[Error PDF: {exc}]"

    return joined if joined else "[PDF sin texto extraíble]"
|
|
|
|
|
|
|
|
def read_zip(raw: bytes, zip_name: str) -> str:
    """Render the text-like members of a ZIP archive as one string.

    Only members whose extension is a key of ``CODE_EXTS`` or is ``.txt`` /
    ``.md`` are included. Each one is decoded as UTF-8 (undecodable bytes
    are replaced), labelled with a guessed language, truncated with
    ``truncate_text`` and emitted under a ``--- name (lang) ---`` header.

    Args:
        raw: The archive's bytes.
        zip_name: Archive name, used only in the returned messages.

    Returns:
        The rendered sections joined by newlines, or a bracketed message
        when the archive is corrupt, another error occurs, or no member
        qualifies. This function does not raise.
    """
    collected = []
    try:
        with zipfile.ZipFile(io.BytesIO(raw)) as zf:
            for info in zf.infolist():
                if info.is_dir():
                    continue

                inner_name = info.filename
                ext = Path(inner_name).suffix.lower()

                # Filter on the name BEFORE reading the payload, so members
                # that would be discarded anyway (images, binaries, ...) are
                # never decompressed and cannot fail the whole archive.
                if ext not in CODE_EXTS and ext not in (".txt", ".md"):
                    continue

                # Reading through the ZipInfo (not the name) fetches exactly
                # this member even if the archive holds duplicate names.
                # NOTE(review): the member is still decompressed in full
                # before truncate_text() shortens it.
                data = zf.read(info)

                # errors="replace" never raises for bytes, so no guard needed.
                text = data.decode("utf-8", errors="replace")

                # The language is guessed on the full text, then the text is cut.
                lang = guess_lang_from_name(inner_name) or guess_lang_from_content(text) or "text"
                text = truncate_text(text)
                collected.append(f"--- {inner_name} ({lang}) ---\n{text}\n")
    except zipfile.BadZipFile:
        return f"[Error leyendo ZIP: archivo corrupto ({zip_name})]"
    except Exception as e:
        return f"[Error leyendo ZIP: {e}]"

    return "\n".join(collected) if collected else f"[ZIP {zip_name} sin archivos útiles]"
|
|
|
|
|
|
|
|
def _render_upload(basename: str, raw) -> Tuple[str, str, str]:
    """Convert one uploaded payload into (report section, preview line, code candidate).

    The candidate is the text the caller may adopt as its "first code"
    value; it is "" when this file offers none.
    """
    suffix = Path(basename).suffix.lower()

    if suffix == ".zip":
        content = read_zip(raw, basename)
        candidate = content[:settings.MAX_CHARS_PER_FILE] if content.strip() else ""
        return f"# {basename} (zip)\n{content}\n", f"📦 {basename}", candidate

    if suffix in (".png", ".jpg", ".jpeg", ".webp", ".bmp"):
        content = read_image_to_text(raw)
        return f"# {basename} (imagen)\n{content}\n", f"🖼️ {basename}", ""

    if suffix == ".pdf":
        content = read_pdf_to_text(raw)
        return f"# {basename} (pdf)\n{content}\n", f"📄 {basename}", ""

    # Anything else is treated as text. Decoding bytes with errors="replace"
    # does not raise; the guard is kept as a best-effort fallback for
    # payloads that are not bytes-like (presumably text-mode file objects).
    try:
        text = raw.decode("utf-8", errors="replace")
    except Exception:
        text = "[No decodificable]"
    text = truncate_text(text)
    lang = guess_lang_from_name(basename) or guess_lang_from_content(text) or "text"
    candidate = text if lang != "text" else ""
    return f"# {basename} ({lang})\n{text}\n", f"📝 {basename} ({lang})", candidate


def read_uploaded_files(files, exclude_text: str = "") -> Tuple[str, str, str]:
    """Turn a batch of uploaded files into a combined text report.

    Each file is read fully, then rendered according to its extension:
    ZIP archives via ``read_zip``, images via ``read_image_to_text``, PDFs
    via ``read_pdf_to_text`` and everything else as UTF-8 text.

    Size limits come from ``settings``:

    * As soon as the running total of bytes read exceeds
      ``MAX_TOTAL_UPLOAD``, the current file and all remaining ones are
      dropped and a warning is added to the preview.
    * A file larger than ``MAX_FILE_SIZE`` is replaced by a placeholder
      section; its bytes still count towards the running total.

    Args:
        files: Iterable of file-like objects providing ``read()``. A
            ``name`` attribute and ``seek()`` are used when available.
        exclude_text: Newline-separated names to skip; each is compared
            with both the file's base name and its full ``name``. ``None``
            or ``""`` excludes nothing.

    Returns:
        A 3-tuple ``(report, preview, first_code)``: the rendered sections
        joined by newlines, one preview line per file joined by newlines,
        and the first non-empty code candidate (text of the first file
        whose language is not "text", or the leading part of the first
        ZIP rendering), or "" if there is none.
    """
    if not files:
        return "", "Sin archivos", ""

    # `or ""` makes a None exclude_text behave like an empty exclusion list.
    exclude: Set[str] = {
        line.strip() for line in (exclude_text or "").splitlines() if line.strip()
    }
    parts: List[str] = []
    preview: List[str] = []
    total_size = 0
    first_code = ""

    for f in files:
        name = getattr(f, "name", "archivo")
        basename = Path(name).name

        if basename in exclude or name in exclude:
            preview.append(f"🚫 {basename} (excluido)")
            continue

        # Best-effort rewind: objects that cannot seek are read as they are.
        try:
            f.seek(0)
        except Exception:
            pass

        raw = f.read()
        file_size = len(raw)
        total_size += file_size

        if total_size > settings.MAX_TOTAL_UPLOAD:
            preview.append("⚠️ Límite total de carga superado, se ignoró el resto.")
            break

        if file_size > settings.MAX_FILE_SIZE:
            parts.append(f"# {basename}\n[Archivo muy grande, ignorado]\n")
            preview.append(f"⚠️ {basename} (muy grande)")
            continue

        section, label, candidate = _render_upload(basename, raw)
        parts.append(section)
        preview.append(label)
        if not first_code and candidate:
            first_code = candidate

    return "\n".join(parts), "\n".join(preview) if preview else "Sin archivos válidos", first_code
|
|
|