Spaces:
Runtime error
Runtime error
| """Load Markdown docs from files, clean up, split, embed, and pickle a FAISS index.""" | |
| import os | |
| from pathlib import Path | |
| from markdown import markdown | |
| import pickle | |
| from bs4 import BeautifulSoup | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings | |
| from langchain.vectorstores import FAISS | |
| from InstructorEmbedding import INSTRUCTOR | |
# Fail fast with the same KeyError as before when the key is not configured,
# but never echo the secret value itself: anything printed here ends up in
# stdout and therefore in whatever logs capture this script's output.
if "HUGGINFACE_APIKEY" not in os.environ:
    raise KeyError("HUGGINFACE_APIKEY")
print("HUGGINFACE_APIKEY is set")
def clean_data(data: str) -> str:
    """Convert Markdown source into plain text with empty lines removed.

    The input is rendered to HTML with ``markdown``, parsed with
    BeautifulSoup's ``html.parser``, and all text nodes are concatenated.
    The result is then split on newlines and empty lines are dropped.

    Args:
        data: Markdown source text.

    Returns:
        The extracted text, one non-empty line per line. Lines that contain
        only whitespace are kept, since only truly empty strings are dropped.
    """
    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it returns the same text nodes.
    text = "".join(soup.find_all(string=True))
    return "\n".join(line for line in text.split("\n") if line)
# Collect the cleaned text of every Markdown file under ./docs, plus a
# parallel list of metadata dicts (same index -> same file).
docs = []
metadatas = []
for p in Path("docs").rglob("*"):
    # Only regular entries whose name ends in .md / .mdx (case-insensitive).
    if p.is_dir() or not str(p).lower().endswith((".md", ".mdx")):
        continue
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(p, encoding="utf-8") as f:
        print(p)
        raw_text = f.read()
    # The "source" label is the file path with its extension stripped.
    filename = os.path.splitext(p)[0]
    docs.append(clean_data(raw_text))
    metadatas.append({"source": filename})
# Split every cleaned document on newlines into chunks of at most 512
# characters (measured with len), with 64 characters of overlap between
# neighbouring chunks. Each chunk inherits the metadata of its source text.
_splitter_options = {
    "separator": "\n",
    "chunk_size": 512,
    "chunk_overlap": 64,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**_splitter_options)
documents = text_splitter.create_documents(docs, metadatas=metadatas)
# Embed the chunks, build the FAISS index from them, and persist the index
# object to docs.pkl with pickle. Progress is reported on stdout.
print("making embedding")
embedding = HuggingFaceEmbeddings()

print("beginning construction of faiss")
search_index = FAISS.from_documents(documents, embedding)

print("beginning pickle")
# NOTE(review): pickle files must only be loaded from trusted sources, so
# whatever reads docs.pkl back needs to trust this artifact.
with Path("docs.pkl").open("wb") as pickle_file:
    pickle.dump(search_index, pickle_file)
print("Pickle complete")