# Politometro / data_processing.py
# Hugging Face Space file-viewer residue (kept as a comment header):
# uploaded by MaNmAxImO, commit cf6ca5c ("Update data_processing.py"), verified.
import os
import json
import faiss
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
def extract_text_from_pdf(pdf_path):
    """Extract text from every page of a PDF file.

    Pages with an embedded text layer are read directly; pages without one
    (e.g. scanned images) fall back to OCR via pdf2image + pytesseract when
    those optional packages are installed.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The concatenated text of all pages, or "" if the file cannot be
        opened or contains no pages.
    """
    try:
        reader = PdfReader(pdf_path)
        if not reader.pages:
            print(f"O arquivo {pdf_path} não contém páginas.")
            return ""
        # Collect page texts in a list and join once at the end instead of
        # repeated string concatenation.
        parts = []
        for i, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text:
                    parts.append(page_text)
                else:
                    # No text layer on this page: use OCR.
                    # BUG FIX: the original referenced convert_from_path and
                    # pytesseract without importing them anywhere, raising
                    # NameError the first time this branch ran. Import lazily
                    # and degrade gracefully when the OCR stack is missing.
                    try:
                        from pdf2image import convert_from_path
                        import pytesseract
                    except ImportError as imp_err:
                        print(
                            f"OCR indisponível para a página {i+1} do arquivo "
                            f"{pdf_path}: {imp_err}"
                        )
                        continue
                    images = convert_from_path(
                        pdf_path, first_page=i + 1, last_page=i + 1
                    )
                    for image in images:
                        parts.append(pytesseract.image_to_string(image, lang='por'))
            except Exception as e:
                print(f"Erro ao processar a página {i+1} do arquivo {pdf_path}: {e}")
        return "".join(parts)
    except Exception as e:
        print(f"Erro ao abrir o arquivo {pdf_path}: {e}")
        return ""
def extract_text_from_txt(txt_path):
    """Return the full contents of a UTF-8 text file at *txt_path*."""
    with open(txt_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def process_documents(directory):
    """Recursively collect text from .pdf and .txt files under *directory*.

    Returns:
        A list of dicts, one per processed file, with keys
        "filename", "content", "path" and "directory". Files with any
        other extension are skipped.
    """
    collected = []
    for folder, _, filenames in os.walk(directory):
        for name in filenames:
            full_path = os.path.join(folder, name)
            if name.endswith(".pdf"):
                content = extract_text_from_pdf(full_path)
            elif name.endswith(".txt"):
                content = extract_text_from_txt(full_path)
            else:
                continue
            collected.append({
                "filename": name,
                "content": content,
                "path": full_path,
                "directory": folder,
            })
    return collected
def save_documents_to_json(documents, output_file):
    """Write *documents* to *output_file* as pretty-printed UTF-8 JSON."""
    payload = json.dumps(documents, ensure_ascii=False, indent=4)
    with open(output_file, "w", encoding="utf-8") as out:
        out.write(payload)
def load_documents(json_file):
    """Load and return the documents list stored in *json_file* (UTF-8 JSON)."""
    with open(json_file, "r", encoding="utf-8") as source:
        data = json.load(source)
    return data
def create_faiss_index(documents):
    """Embed each document's content and build a FAISS L2 index over them.

    Args:
        documents: List of dicts, each with at least a "content" key.
            NOTE(review): assumes *documents* is non-empty — an empty list
            leaves no embedding matrix to size the index from; confirm callers.

    Returns:
        A (index, model) tuple: the populated ``faiss.IndexFlatL2`` and the
        ``SentenceTransformer`` used to compute the embeddings.
    """
    model = SentenceTransformer('neuralmind/bert-base-portuguese-cased')
    # Encode straight to numpy: faiss consumes contiguous float32 numpy
    # matrices, so the original torch-tensor round-trip
    # (convert_to_tensor=True + np.array(tensor)) was fragile — it breaks
    # e.g. when the model places tensors on a GPU device.
    embeddings = model.encode(
        [doc["content"] for doc in documents],
        convert_to_numpy=True,
    )
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, model
def save_index(index, file_path):
    """Persist a FAISS *index* to disk at *file_path* (FAISS binary format)."""
    faiss.write_index(index, file_path)
def load_index(file_path):
    """Read and return a FAISS index previously saved at *file_path*."""
    return faiss.read_index(file_path)
if __name__ == "__main__":
    input_directory = "data"  # Root folder holding the source documents
    output_json = "data/documents.json"  # Final JSON file
    output_index = "models/faiss_index.pkl"  # FAISS index file
    # ROBUSTNESS: create output directories up front — the original crashed
    # with FileNotFoundError if data/ or models/ did not already exist.
    os.makedirs(os.path.dirname(output_json) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(output_index) or ".", exist_ok=True)
    documents = process_documents(input_directory)
    save_documents_to_json(documents, output_json)
    index, _ = create_faiss_index(documents)
    save_index(index, output_index)