RAG_voice / vector_store.py
jeongsoo's picture
Add application file
4a98f26
"""
Improved vector store module - optimized Milvus configuration.
"""
from typing import List, Dict, Any, Optional
import uuid
from langchain.schema import Document
# Vector store imports
try:
# Imports for the current (split-package) LangChain releases
from langchain_milvus import Milvus
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
MODERN_IMPORTS = True
print("์ตœ์‹  langchain ํŒจํ‚ค์ง€ ์ž„ํฌํŠธ ์„ฑ๊ณต")
except ImportError:
# Imports for older releases (everything lives in langchain_community)
from langchain_community.vectorstores import Milvus, FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
MODERN_IMPORTS = False
print("๋ ˆ๊ฑฐ์‹œ langchain_community ํŒจํ‚ค์ง€ ์‚ฌ์šฉ")
from config import MILVUS_HOST, MILVUS_PORT, MILVUS_COLLECTION, EMBEDDING_MODEL
class VectorStore:
    """Thin wrapper around a LangChain vector store (Milvus or FAISS).

    Milvus is the primary backend. Whenever a Milvus operation in
    ``create_or_load`` fails, the instance switches itself to FAISS
    (``use_milvus`` is set to ``False``) and keeps working locally.
    """

    def __init__(self, use_milvus: bool = True):
        """Load the embedding model and prepare an (uninitialised) store.

        Args:
            use_milvus: Use Milvus when True; use FAISS when False.
        """
        self.use_milvus = use_milvus

        # Embedding model configuration.
        print(f"์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋“œ ์ค‘: {EMBEDDING_MODEL}")
        model_kwargs = {
            "device": "cpu",
            # NOTE(security): this lets the model repository execute its own
            # Python code at load time. The original author marked it as
            # required for the configured model; only use trusted models.
            "trust_remote_code": True,
        }
        # Unit-length vectors, which is what the COSINE metric below expects.
        encode_kwargs = {"normalize_embeddings": True}
        self.embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
        # Created lazily by create_or_load() / load_local().
        self.vector_store = None
        print(f"์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ์ดˆ๊ธฐํ™” ์™„๋ฃŒ: {EMBEDDING_MODEL}")

    @staticmethod
    def _milvus_connection_args() -> dict:
        """Return the Milvus connection settings taken from ``config``."""
        return {
            "host": MILVUS_HOST,
            "port": MILVUS_PORT,
        }

    @staticmethod
    def _milvus_index_params() -> dict:
        """Return the Milvus index settings (FLAT index, cosine metric)."""
        return {
            "index_type": "FLAT",     # exact search, accuracy first
            "metric_type": "COSINE",  # suits the normalised embeddings
            "params": {},             # FLAT takes no extra parameters
        }

    def init_milvus(self) -> Milvus:
        """Connect to the configured Milvus collection.

        Returns:
            A Milvus vector store instance bound to ``MILVUS_COLLECTION``.
        """
        return Milvus(
            embedding_function=self.embeddings,
            collection_name=MILVUS_COLLECTION,
            connection_args=self._milvus_connection_args(),
            index_params=self._milvus_index_params(),
        )

    def init_faiss(self) -> FAISS:
        """Create an empty FAISS vector store (local fallback).

        ``FAISS.from_documents([], ...)`` cannot be used for this: LangChain
        sizes the index from the first embedding, so an empty document list
        raises ``IndexError``. The index is therefore built explicitly, with
        its dimension probed from the embedding model.

        Returns:
            An empty FAISS vector store instance.
        """
        # Local imports: only needed on the FAISS path, and both come from
        # langchain_community, which this module already depends on.
        from langchain_community.docstore.in_memory import InMemoryDocstore
        from langchain_community.vectorstores.faiss import dependable_faiss_import

        faiss = dependable_faiss_import()
        dimension = len(self.embeddings.embed_query("dimension probe"))
        return FAISS(
            embedding_function=self.embeddings,
            # L2 index, the same default FAISS.from_documents() uses.
            index=faiss.IndexFlatL2(dimension),
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )

    def _create_or_load_milvus(self, documents: Optional[List[Document]]) -> None:
        """Populate ``self.vector_store`` from Milvus, falling back to FAISS."""
        if documents:
            # Documents supplied: rebuild the collection from scratch.
            try:
                print(f"Milvus ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ: {MILVUS_COLLECTION} (๊ธฐ์กด ์ปฌ๋ ‰์…˜ ์‚ญ์ œ)")
                self.vector_store = Milvus.from_documents(
                    documents=documents,
                    embedding=self.embeddings,
                    collection_name=MILVUS_COLLECTION,
                    connection_args=self._milvus_connection_args(),
                    index_params=self._milvus_index_params(),
                    drop_old=True,  # drops the existing collection (rebuild)
                )
                print(f"Milvus ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ ์™„๋ฃŒ: {len(documents)}๊ฐœ ๋ฌธ์„œ ์ธ๋ฑ์‹ฑ๋จ")
            except Exception as e:
                # Deliberately broad: any Milvus failure triggers the fallback.
                print(f"Milvus ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ ์‹คํŒจ: {e}")
                print("๋Œ€์ฒด ๋ฐฉ์•ˆ์œผ๋กœ FAISS ์‚ฌ์šฉ")
                self.use_milvus = False
                self.vector_store = FAISS.from_documents(documents, self.embeddings)
        else:
            # No documents: attach to the existing collection.
            try:
                self.vector_store = self.init_milvus()
            except Exception as e:
                # Deliberately broad: any Milvus failure triggers the fallback.
                print(f"Milvus ์ปฌ๋ ‰์…˜ ๋กœ๋“œ ์‹คํŒจ: {e}")
                print("๋Œ€์ฒด ๋ฐฉ์•ˆ์œผ๋กœ FAISS ์‚ฌ์šฉ")
                self.use_milvus = False
                self.vector_store = self.init_faiss()

    def create_or_load(self, documents: Optional[List[Document]] = None) -> Any:
        """Create the vector store from documents, or load/create an empty one.

        Args:
            documents: Documents to index. When None or empty, the existing
                Milvus collection is loaded (Milvus mode) or an empty FAISS
                index is created (FAISS mode).

        Returns:
            The vector store instance, also kept in ``self.vector_store``.
        """
        if self.use_milvus:
            self._create_or_load_milvus(documents)
        elif documents:
            print(f"FAISS ์ธ๋ฑ์Šค ์ƒ์„ฑ: {len(documents)}๊ฐœ ๋ฌธ์„œ")
            self.vector_store = FAISS.from_documents(documents, self.embeddings)
            print("FAISS ์ธ๋ฑ์Šค ์ƒ์„ฑ ์™„๋ฃŒ")
        else:
            self.vector_store = self.init_faiss()
            print("๋นˆ FAISS ์ธ๋ฑ์Šค ์ดˆ๊ธฐํ™” ์™„๋ฃŒ")
        return self.vector_store

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the vector store, creating it first if needed.

        Args:
            documents: Documents to add.
        """
        if self.vector_store is None:
            # First use: build the store directly from these documents.
            self.create_or_load(documents)
        else:
            # Milvus and FAISS expose the same add_documents() API.
            self.vector_store.add_documents(documents)

    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
        """Run a vector similarity search.

        Args:
            query: Search query text.
            k: Number of results requested.

        Returns:
            The documents the underlying store returns for the query.

        Raises:
            ValueError: If the vector store has not been initialised yet.
        """
        if self.vector_store is None:
            raise ValueError("๋ฒกํ„ฐ ์Šคํ† ์–ด๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
        print(f"๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ: '{query}', ์ƒ์œ„ {k}๊ฐœ ๊ฒฐ๊ณผ ์š”์ฒญ")
        results = self.vector_store.similarity_search(query, k=k)
        print(f"๊ฒ€์ƒ‰ ์™„๋ฃŒ: {len(results)}๊ฐœ ๊ฒฐ๊ณผ ์ฐพ์Œ")
        return results

    def save_local(self, path: str = "faiss_index") -> None:
        """Save the FAISS index to disk (FAISS mode only).

        Does nothing when Milvus is in use or no store has been created.

        Args:
            path: Destination path for the index.
        """
        if not self.use_milvus and self.vector_store is not None:
            self.vector_store.save_local(path)
            print(f"FAISS ์ธ๋ฑ์Šค ๋กœ์ปฌ ์ €์žฅ ์™„๋ฃŒ: {path}")

    # load_local below enables FAISS deserialization explicitly.
    def load_local(self, path: str = "faiss_index") -> None:
        """Load a FAISS index from disk (FAISS mode only).

        Does nothing when Milvus is in use. If loading fails, the error is
        printed with its traceback and a fresh empty FAISS index is used.

        Args:
            path: Path of the index to load.
        """
        if not self.use_milvus:
            try:
                print(f"FAISS ์ธ๋ฑ์Šค ๋กœ๋“œ ์ค‘: {path}")
                # NOTE(security): loading a FAISS store unpickles the saved
                # docstore, which can execute arbitrary code. Only load
                # index directories this application wrote itself.
                self.vector_store = FAISS.load_local(
                    path,
                    self.embeddings,
                    allow_dangerous_deserialization=True,
                )
                print(f"FAISS ์ธ๋ฑ์Šค ๋กœ๋“œ ์™„๋ฃŒ: {path}")
            except Exception as e:
                # Best effort: report the failure, then start from empty.
                print(f"FAISS ์ธ๋ฑ์Šค ๋กœ๋“œ ์‹คํŒจ: {e}")
                import traceback
                traceback.print_exc()
                self.vector_store = self.init_faiss()
                print("์ƒˆ FAISS ์ธ๋ฑ์Šค ์ดˆ๊ธฐํ™”๋จ")