"""Load html from files, clean up, split, ingest into Weaviate.""" |
|
import os
from pathlib import Path

import weaviate
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
|
|
def clean_data(data):
    """Extract readable text from a documentation page's main content block."""
    soup = BeautifulSoup(data, features="lxml")
    text = soup.find_all("main", {"id": "main-content"})[0].get_text()
    return "\n".join([t for t in text.split("\n") if t])
|
|
docs = []
metadatas = []
# Walk the local wget mirror of the docs (see the setup note above); Path
# cannot traverse a remote URL, so the scheme is dropped here.
for p in Path("textworld.readthedocs.io/en/latest/").rglob("*"):
    if p.is_dir():
        continue
    with open(p) as f:
        docs.append(clean_data(f.read()))
        print(f"Loaded {p}")
        metadatas.append({"source": p})
|
|
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
|
|
documents = text_splitter.create_documents(docs, metadatas=metadatas)
print(f"Created {len(documents)} document chunks")
|
|
WEAVIATE_URL = "https://tro.weaviate.network/"

client = weaviate.Client(
    url=WEAVIATE_URL,
    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)
|
|
client.schema.get()  # sanity check that the Weaviate instance is reachable
|
schema = {
    "classes": [
        {
            "class": "Paragraphs",
            "description": "A written paragraph",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
                {
                    "dataType": ["text"],
                    "description": "The link",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "source",
                },
            ],
        },
    ]
}
|
|
client.schema.create(schema) |
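
# Optional (an assumption beyond the original script): with the v3 Python
# client, batching can be configured to flush objects in fixed-size chunks
# instead of all at once when the context manager below exits.
client.batch.configure(batch_size=100)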
|
|
|
with client.batch as batch:
    for text in documents:
        batch.add_data_object(
            {"content": text.page_content, "source": str(text.metadata["source"])},
            "Paragraphs",
        )
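
# A quick sanity check (an addition, not part of the original script), using
# the weaviate-client v3 GraphQL query API: pull back the paragraphs nearest
# to a sample question to confirm ingestion and vectorization worked.
result = (
    client.query.get("Paragraphs", ["content", "source"])
    .with_near_text({"concepts": ["How do I generate a TextWorld game?"]})
    .with_limit(2)
    .do()
)
print(result)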
|
|