chat-with-docs / create_database.py
mehmet0001's picture
Update create_database.py
1dd505d verified
from sentence_transformers import SentenceTransformer
import chromadb
def split_list(list_, chunk_size):
    """Split a sequence into consecutive slices of at most ``chunk_size`` items.

    Args:
        list_: Any sliceable sequence that supports ``len()`` (list, tuple,
            str, ...). It is not modified.
        chunk_size: Maximum number of items per slice; must be at least 1.

    Returns:
        A list of slices of ``list_`` in original order. Every slice has
        ``chunk_size`` items except possibly the last one. An empty input
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every
    # element, and a zero step fails with an unrelated-looking range() error,
    # so reject both up front with a clear message.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")
    return [
        list_[start:start + chunk_size]
        for start in range(0, len(list_), chunk_size)
    ]
def create_database(txt):
    """Build a fresh "chat-with-docs" Chroma collection from ``txt``.

    Any existing collection named "chat-with-docs" on the client is deleted
    first, so the returned collection only contains the entries of ``txt``.

    Args:
        txt: Sequence of documents to index. It must support ``len()`` and
            slicing; presumably a list of strings, one per passage (verify
            against the caller).

    Returns:
        The newly created collection. Entry ``i`` of ``txt`` is stored under
        the id ``str(i)``.
    """

    class EmbeddingFn:
        """Callable wrapper that embeds documents with a SentenceTransformer."""

        def __init__(self, model_name):
            self.model = SentenceTransformer(model_name)

        # NOTE(review): the parameter name ``input`` shadows the builtin but
        # appears to be what Chroma expects of an embedding function; confirm
        # before renaming.
        def __call__(self, input):
            return self.model.encode(input).tolist()

    collection_name = "chat-with-docs"
    # Documents and ids are added in slices of this size rather than in one
    # call. NOTE(review): presumably chosen to stay under Chroma's per-call
    # batch limit; confirm.
    batch_size = 5000

    embedding_fn = EmbeddingFn("sentence-transformers/all-mpnet-base-v2")
    ids = [str(i) for i in range(len(txt))]

    # NOTE(review): presumably resets cached client state so repeated calls in
    # one process start from a clean client; confirm against Chroma docs.
    chromadb.api.client.SharedSystemClient.clear_system_cache()
    chroma_cli = chromadb.Client()

    # Depending on the installed Chroma version, list_collections() yields
    # either collection objects or plain name strings (and the string form
    # raises on attribute access), so accept both instead of assuming ``.name``.
    existing_collections = {
        entry if isinstance(entry, str) else entry.name
        for entry in chroma_cli.list_collections()
    }
    if collection_name in existing_collections:
        chroma_cli.delete_collection(name=collection_name)

    collection = chroma_cli.create_collection(
        collection_name, embedding_function=embedding_fn
    )

    # Slice documents and ids identically so each batch stays aligned; the
    # ``txt`` argument itself is left untouched.
    txt_batches = split_list(txt, batch_size)
    ids_batches = split_list(ids, batch_size)
    for txt_chunk, ids_chunk in zip(txt_batches, ids_batches):
        collection.add(documents=txt_chunk, ids=ids_chunk)
    return collection