# Politometro / data_processing.py
# Hugging Face Space file-viewer residue (kept as a comment header):
# uploaded by MaNmAxImO, commit cf6ca5c ("Update data_processing.py"), verified.
import os
import json
import faiss
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
def extract_text_from_pdf(pdf_path):
    """Extract text from every page of a PDF file.

    Pages with an embedded text layer are read directly; pages without one
    (e.g. scanned images) fall back to OCR via pdf2image + pytesseract when
    those optional packages are installed.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The concatenated text of all pages, or "" if the file cannot be
        opened or contains no pages.
    """
    try:
        reader = PdfReader(pdf_path)
        if not reader.pages:
            print(f"O arquivo {pdf_path} não contém páginas.")
            return ""
        # Collect page texts in a list and join once at the end instead of
        # repeated string concatenation.
        parts = []
        for i, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text:
                    parts.append(page_text)
                else:
                    # No text layer on this page: use OCR.
                    # BUG FIX: the original referenced convert_from_path and
                    # pytesseract without importing them anywhere, raising
                    # NameError the first time this branch ran. Import lazily
                    # and degrade gracefully when the OCR stack is missing.
                    try:
                        from pdf2image import convert_from_path
                        import pytesseract
                    except ImportError as imp_err:
                        print(
                            f"OCR indisponível para a página {i+1} do arquivo "
                            f"{pdf_path}: {imp_err}"
                        )
                        continue
                    images = convert_from_path(
                        pdf_path, first_page=i + 1, last_page=i + 1
                    )
                    for image in images:
                        parts.append(pytesseract.image_to_string(image, lang='por'))
            except Exception as e:
                print(f"Erro ao processar a página {i+1} do arquivo {pdf_path}: {e}")
        return "".join(parts)
    except Exception as e:
        print(f"Erro ao abrir o arquivo {pdf_path}: {e}")
        return ""
def extract_text_from_txt(txt_path):
    """Return the full contents of a UTF-8 text file at *txt_path*."""
    with open(txt_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def process_documents(directory):
    """Recursively collect text from .pdf and .txt files under *directory*.

    Returns:
        A list of dicts, one per processed file, with keys
        "filename", "content", "path" and "directory". Files with any
        other extension are skipped.
    """
    collected = []
    for folder, _, filenames in os.walk(directory):
        for name in filenames:
            full_path = os.path.join(folder, name)
            if name.endswith(".pdf"):
                content = extract_text_from_pdf(full_path)
            elif name.endswith(".txt"):
                content = extract_text_from_txt(full_path)
            else:
                continue
            collected.append({
                "filename": name,
                "content": content,
                "path": full_path,
                "directory": folder,
            })
    return collected
def save_documents_to_json(documents, output_file):
    """Write *documents* to *output_file* as pretty-printed UTF-8 JSON."""
    payload = json.dumps(documents, ensure_ascii=False, indent=4)
    with open(output_file, "w", encoding="utf-8") as out:
        out.write(payload)
def load_documents(json_file):
    """Load and return the documents list stored in *json_file* (UTF-8 JSON)."""
    with open(json_file, "r", encoding="utf-8") as source:
        data = json.load(source)
    return data
def create_faiss_index(documents):
    """Embed each document's content and build a FAISS L2 index over them.

    Args:
        documents: List of dicts, each with at least a "content" key.
            NOTE(review): assumes *documents* is non-empty — an empty list
            leaves no embedding matrix to size the index from; confirm callers.

    Returns:
        A (index, model) tuple: the populated ``faiss.IndexFlatL2`` and the
        ``SentenceTransformer`` used to compute the embeddings.
    """
    model = SentenceTransformer('neuralmind/bert-base-portuguese-cased')
    # Encode straight to numpy: faiss consumes contiguous float32 numpy
    # matrices, so the original torch-tensor round-trip
    # (convert_to_tensor=True + np.array(tensor)) was fragile — it breaks
    # e.g. when the model places tensors on a GPU device.
    embeddings = model.encode(
        [doc["content"] for doc in documents],
        convert_to_numpy=True,
    )
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, model
def save_index(index, file_path):
    """Persist a FAISS *index* to disk at *file_path* (FAISS binary format)."""
    faiss.write_index(index, file_path)
def load_index(file_path):
    """Read and return a FAISS index previously saved at *file_path*."""
    return faiss.read_index(file_path)
if __name__ == "__main__":
    input_directory = "data"  # Root folder holding the source documents
    output_json = "data/documents.json"  # Final JSON file
    output_index = "models/faiss_index.pkl"  # FAISS index file
    # ROBUSTNESS: create output directories up front — the original crashed
    # with FileNotFoundError if data/ or models/ did not already exist.
    os.makedirs(os.path.dirname(output_json) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(output_index) or ".", exist_ok=True)
    documents = process_documents(input_directory)
    save_documents_to_json(documents, output_json)
    index, _ = create_faiss_index(documents)
    save_index(index, output_index)