"""Load html from files, clean up, split, ingest into Weaviate.""" |
|
import os
from pathlib import Path

import weaviate
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
|
|
def clean_data(data):
    """Extract readable text from a documentation page's main content block."""
    soup = BeautifulSoup(data, features="lxml")
    text = soup.find_all("main", {"id": "main-content"})[0].get_text()
    return "\n".join([t for t in text.split("\n") if t])
|
|
docs = []
metadatas = []
# Walk the local wget mirror of the docs (see the setup note above); Path
# cannot traverse a remote URL, so the scheme is dropped here.
for p in Path("textworld.readthedocs.io/en/latest/").rglob("*"):
    if p.is_dir():
        continue
    with open(p) as f:
        docs.append(clean_data(f.read()))
        print(f"Loaded {p}")
        metadatas.append({"source": p})
|
|
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
|
|
documents = text_splitter.create_documents(docs, metadatas=metadatas)
print(f"Created {len(documents)} document chunks")
|
|
WEAVIATE_URL = "https://tro.weaviate.network/"

client = weaviate.Client(
    url=WEAVIATE_URL,
    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)
|
|
client.schema.get()  # sanity check that the Weaviate instance is reachable
|
schema = {
    "classes": [
        {
            "class": "Paragraphs",
            "description": "A written paragraph",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
                {
                    "dataType": ["text"],
                    "description": "The link",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "source",
                },
            ],
        },
    ]
}
|
|
client.schema.create(schema) |
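
# Optional (an assumption beyond the original script): with the v3 Python
# client, batching can be configured to flush objects in fixed-size chunks
# instead of all at once when the context manager below exits.
client.batch.configure(batch_size=100)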
|
|
|
with client.batch as batch:
    for text in documents:
        batch.add_data_object(
            {"content": text.page_content, "source": str(text.metadata["source"])},
            "Paragraphs",
        )
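
# A quick sanity check (an addition, not part of the original script), using
# the weaviate-client v3 GraphQL query API: pull back the paragraphs nearest
# to a sample question to confirm ingestion and vectorization worked.
result = (
    client.query.get("Paragraphs", ["content", "source"])
    .with_near_text({"concepts": ["How do I generate a TextWorld game?"]})
    .with_limit(2)
    .do()
)
print(result)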
|
|