|
import openai |
|
import pandas as pd |
|
import numpy as np |
|
import faiss |
|
import os |
|
from dotenv import load_dotenv |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
from openai import OpenAI

# Load variables from a local .env file (if any) into the process
# environment so the API key can be looked up below.
load_dotenv()

# May be None when OPENAI_API_KEY is not set anywhere.
api_key = os.getenv('OPENAI_API_KEY')

# Shared client used by every embedding helper in this module. The key is
# passed explicitly so the value read above is actually the one in use.
# NOTE(review): per the openai client docs, api_key=None falls back to the
# OPENAI_API_KEY environment variable, so this matches the bare OpenAI() call.
client = OpenAI(api_key=api_key)
|
|
|
def get_openai_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced with single spaces before the request
    is sent through the module-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding
|
|
|
|
|
def save_openai_embeddings(csv_file, model='text-embedding-3-small'):
    """Embed every utterance in a CSV file and write the result to disk.

    Reads ``csv_file``, computes one embedding per value of its ``utterance``
    column via ``get_openai_embedding``, stores them in a new
    ``ada_embeddings`` column and writes the whole frame to
    ``embeddings/openai_embeddings.csv`` (without the index).

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file that has an ``utterance`` column.
        model: Name of the embedding model to use for every row.

    Returns:
        None. The result is written to disk and a confirmation is printed.
    """
    output_path = 'embeddings/openai_embeddings.csv'

    df = pd.read_csv(csv_file)

    # Create the output directory up front: to_csv does not create missing
    # directories, and failing here is cheaper than failing after all the
    # embedding requests have already been made.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Honour the caller's model choice instead of a hard-coded model name.
    df['ada_embeddings'] = df.utterance.apply(
        lambda x: get_openai_embedding(x, model=model)
    )

    df.to_csv(output_path, index=False)
    print(f"Embeddings saved to {output_path}.")
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_openai_embeddings(csv_file):
    """Load the ``ada_embeddings`` column of a CSV file as numeric vectors.

    A list stored in a DataFrame cell is written by ``to_csv`` as its string
    form (e.g. ``"[0.1, 0.2]"``), so after ``read_csv`` each cell is a string.
    Those strings are parsed back into Python lists here; cells that are not
    strings are returned unchanged.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file that has an ``ada_embeddings`` column.

    Returns:
        A list with one entry per row, in file order.

    Raises:
        KeyError: If the file has no ``ada_embeddings`` column.
        ValueError, SyntaxError: If a string cell is not a valid Python
            literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    df = pd.read_csv(csv_file)
    raw_values = df['ada_embeddings'].tolist()

    # literal_eval only evaluates literals, so it is safe on file contents
    # (unlike eval).
    return [
        ast.literal_eval(value) if isinstance(value, str) else value
        for value in raw_values
    ]
|
|
|
|
|
|
|
def calculate_openai_similarity(user_text, df, top_n=5, model='text-embedding-3-small'):
    """Rank the rows of ``df`` by cosine similarity to ``user_text``.

    The text is embedded with ``get_openai_embedding`` and compared against
    the embedding stored in each row. The score is written to a
    ``similarity`` column on ``df`` itself (the caller's frame is modified),
    and the ``top_n`` highest-scoring rows are returned.

    Column lookup:
        * embeddings are read from ``ada_embedding`` when present, otherwise
          from ``ada_embeddings`` (the name used by the save/load helpers in
          this module);
        * the text column returned is ``combined`` when present, otherwise
          ``utterance``.

    Embeddings stored as strings (as they are after a CSV round trip) are
    parsed into lists before scoring.

    Args:
        user_text: Query string to compare against the stored embeddings.
        df: DataFrame holding an embedding column and a text column as
            described above.
        top_n: Number of best matches to return.
        model: Embedding model used for ``user_text``.
            NOTE(review): presumably this must match the model that produced
            the stored embeddings - confirm against the caller.

    Returns:
        A DataFrame with the text column and ``similarity``, sorted by
        descending similarity and limited to ``top_n`` rows.

    Raises:
        KeyError: If ``df`` has neither embedding column or neither text
            column.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    embedding_col = 'ada_embedding' if 'ada_embedding' in df.columns else 'ada_embeddings'
    text_col = 'combined' if 'combined' in df.columns else 'utterance'

    # Touch both columns before calling the API so a missing column raises
    # KeyError without spending an embedding request.
    stored_embeddings = df[embedding_col]
    df[text_col]

    user_embedding = np.array(
        get_openai_embedding(user_text, model=model)
    ).reshape(1, -1)

    def _as_vector(value):
        # Cells read back from CSV are strings such as "[0.1, 0.2]".
        return ast.literal_eval(value) if isinstance(value, str) else value

    df['similarity'] = stored_embeddings.apply(
        lambda x: cosine_similarity([_as_vector(x)], user_embedding)[0][0]
    )

    top_matches = df.sort_values(by='similarity', ascending=False).head(top_n)

    return top_matches[[text_col, 'similarity']]
|
|
|
def get_openai_similarity(user_text, df, top_n=5):
    """Return the ``top_n`` rows of ``df`` most similar to ``user_text``.

    This function was a line-for-line copy of
    ``calculate_openai_similarity``; it now delegates to it so the two
    cannot drift apart.

    Args:
        user_text: Query string to compare against the stored embeddings.
        df: DataFrame of stored embeddings, forwarded unchanged. The callee
            writes a ``similarity`` column onto it.
        top_n: Number of best matches to return.

    Returns:
        Whatever ``calculate_openai_similarity`` returns for the same
        arguments.
    """
    return calculate_openai_similarity(user_text, df, top_n=top_n)
|
|
|
|
|
|
|
|
|
|