import os
import json

import faiss
import numpy as np
import pytesseract  # used by the OCR fallback below; requires the Tesseract binary to be installed
from pdf2image import convert_from_path  # used by the OCR fallback below; requires poppler to be installed
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        if not reader.pages:
            print(f"The file {pdf_path} contains no pages.")
            return ""
        text = ""
        for i, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
                else:
                    # Fall back to OCR for pages with no extractable text
                    images = convert_from_path(pdf_path, first_page=i + 1, last_page=i + 1)
                    for image in images:
                        text += pytesseract.image_to_string(image, lang='por')
            except Exception as e:
                print(f"Error processing page {i+1} of file {pdf_path}: {e}")
        return text
    except Exception as e:
        print(f"Error opening file {pdf_path}: {e}")
        return ""
def extract_text_from_txt(txt_path):
    with open(txt_path, "r", encoding="utf-8") as f:
        return f.read()
def process_documents(directory):
    documents = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif file.endswith(".txt"):
                text = extract_text_from_txt(file_path)
            else:
                continue
            document = {
                "filename": file,
                "content": text,
                "path": file_path,
                "directory": root
            }
            documents.append(document)
    return documents
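# Optional, hedged sketch (not in the original script): the BERT-based model used
# below truncates inputs at its maximum sequence length (512 tokens), so only the
# start of long documents gets embedded. A common workaround is to split content
# into fixed-size word chunks before indexing; the helper name `chunk_text` and
# the chunk size are assumptions introduced here.
def chunk_text(text, max_words=200):
    # Split on whitespace and group into windows of max_words words each.
    words = text.split()
    return [" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words)]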
def save_documents_to_json(documents, output_file):
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=4)
def load_documents(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        return json.load(f)
def create_faiss_index(documents):
    model = SentenceTransformer('neuralmind/bert-base-portuguese-cased')
    # Encode directly to a NumPy array; FAISS expects float32 matrices,
    # not torch tensors.
    embeddings = model.encode(
        [doc["content"] for doc in documents], convert_to_numpy=True
    ).astype(np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, model
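# A minimal retrieval sketch, not part of the original script: it encodes a query
# with the same SentenceTransformer model and returns the k nearest documents by
# L2 distance. The name `search` and the parameter `k` are assumptions added here.
def search(query, model, index, documents, k=3):
    # Encode the query to a float32 matrix of shape (1, dim), as FAISS expects.
    query_embedding = model.encode([query], convert_to_numpy=True).astype(np.float32)
    distances, indices = index.search(query_embedding, k)
    # Map FAISS row ids back to the original document dicts.
    return [(documents[i], float(d)) for i, d in zip(indices[0], distances[0])]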
def save_index(index, file_path):
    faiss.write_index(index, file_path)
def load_index(file_path):
    return faiss.read_index(file_path)
if __name__ == "__main__":
    input_directory = "data"                 # Root folder containing the documents
    output_json = "data/documents.json"      # Final JSON file
    output_index = "models/faiss_index.pkl"  # FAISS index (native FAISS format, despite the .pkl extension)
    os.makedirs("models", exist_ok=True)     # faiss.write_index fails if the folder does not exist
    documents = process_documents(input_directory)
    save_documents_to_json(documents, output_json)
    index, _ = create_faiss_index(documents)
    save_index(index, output_index)
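    # Hedged usage sketch (not in the original): reload the persisted index and run
    # a sample query through the `search` helper above. The query string and k are
    # illustrative placeholders.
    # loaded_index = load_index(output_index)
    # loaded_docs = load_documents(output_json)
    # model = SentenceTransformer('neuralmind/bert-base-portuguese-cased')
    # for doc, dist in search("your query here", model, loaded_index, loaded_docs, k=3):
    #     print(doc["filename"], dist)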