# Export header: file size 2,591 bytes, revision 5ecde30 (line-number gutter removed).
import openai
import pandas as pd
import numpy as np
import faiss
import os
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
# Module-level setup. Order matters: the .env file is loaded into the process
# environment first, and only then is the OpenAI client constructed.
load_dotenv()  # take environment variables from .env.
# The key is read here but is not passed to the client constructed below.
api_key = os.getenv('OPENAI_API_KEY')
# Do not print api_key when debugging: it is a secret and would leak into logs.
from openai import OpenAI
# Shared client used by the embedding helpers in this module. It is built with
# no arguments. NOTE(review): presumably it picks up OPENAI_API_KEY from the
# environment populated above - confirm against the OpenAI client docs.
client = OpenAI()
def get_openai_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Every newline character in ``text`` is replaced by a single space before
    the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    flattened = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[flattened], model=model)
    first_item = response.data[0]
    return first_item.embedding
def save_openai_embeddings(csv_file, model='text-embedding-3-small'):
    """Embed every utterance in a CSV file and save the result as a new CSV.

    The input file is read with pandas, each value of its ``utterance`` column
    is embedded with ``get_openai_embedding``, and the vectors are stored in a
    new ``ada_embeddings`` column. The augmented table is written to
    ``embeddings/openai_embeddings.csv`` (relative to the working directory).

    Args:
        csv_file: Path to the input CSV; it must contain an ``utterance``
            column.
        model: Name of the embedding model used for every row.

    Returns:
        None. The function writes the output file and prints a confirmation.
    """
    output_path = 'embeddings/openai_embeddings.csv'

    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Honour the caller's ``model`` argument instead of a hard-coded model name.
    df['ada_embeddings'] = df.utterance.apply(
        lambda utterance: get_openai_embedding(utterance, model=model)
    )

    # Create the target directory first so a fresh checkout does not fail
    # inside to_csv after all the embedding requests have already been made.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Embeddings saved to {output_path}.")
# Get and save the embeddings for the cleaned intent data (one-off step; uncomment the call below to run it).
#save_openai_embeddings(r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\Pager_Intents_cleaned.csv')
def load_openai_embeddings(csv_file):
    """Return the ``ada_embeddings`` column of a CSV file as a Python list.

    Args:
        csv_file: Path (or other source accepted by ``pandas.read_csv``) of a
            CSV file that has an ``ada_embeddings`` column.

    Returns:
        A list with one entry per row, holding the column values exactly as
        pandas parsed them.
        NOTE(review): vectors that were written to CSV as Python lists
        presumably come back as strings here - confirm how callers decode them.

    Raises:
        KeyError: If the file has no ``ada_embeddings`` column.
    """
    # Read the whole table, then pull out just the embeddings column.
    return pd.read_csv(csv_file)['ada_embeddings'].to_list()
# Function to calculate similarity between user input and precomputed embeddings
# Column names that may hold the precomputed vectors. 'ada_embedding' is the
# name this function has always read; 'ada_embeddings' is the name written by
# save_openai_embeddings, so frames loaded from that file work as well.
_EMBEDDING_COLUMNS = ('ada_embedding', 'ada_embeddings')


def _find_embedding_column(df):
    """Return the name of the first known embedding column present in ``df``."""
    for name in _EMBEDDING_COLUMNS:
        if name in df.columns:
            return name
    raise KeyError(
        f"DataFrame has no embedding column; expected one of {_EMBEDDING_COLUMNS}"
    )


def _coerce_embedding(value):
    """Convert one stored embedding (a sequence, or its text form) to a float array."""
    import json  # local import keeps this block independent of the file's import list

    # A vector that went through a CSV round trip comes back as text such as
    # "[0.1, 0.2]"; decode it before building the array.
    if isinstance(value, str):
        value = json.loads(value)
    return np.asarray(value, dtype=float)


def calculate_openai_similarity(user_text, df, top_n=5, model='text-embedding-3-small'):
    """Rank the rows of ``df`` by cosine similarity to ``user_text``.

    Args:
        user_text: Text to compare against the precomputed embeddings.
        df: DataFrame with a ``combined`` column and an embedding column named
            ``ada_embedding`` or ``ada_embeddings``. Each embedding may be a
            numeric sequence or the text form of one. A ``similarity`` column
            is added to (or overwritten in) this frame in place.
        top_n: Number of best-matching rows to return.
        model: Embedding model used for ``user_text``; it should be the model
            the precomputed embeddings were created with.

    Returns:
        A DataFrame with the ``combined`` and ``similarity`` columns of the
        ``top_n`` most similar rows, highest similarity first.

    Raises:
        KeyError: If ``df`` has no embedding column or no ``combined`` column.
    """
    # Validate the frame before calling the API so a malformed frame does not
    # cost an embedding request.
    column = _find_embedding_column(df)

    # Get embedding for the user input text
    user_embedding = np.asarray(
        get_openai_embedding(user_text, model=model), dtype=float
    ).reshape(1, -1)

    # Calculate cosine similarity between user input and all precomputed
    # embeddings in one call on the stacked matrix instead of once per row.
    if len(df):
        matrix = np.vstack([_coerce_embedding(value) for value in df[column]])
        scores = cosine_similarity(matrix, user_embedding)[:, 0]
    else:
        # np.vstack rejects an empty list; an empty frame simply has no scores.
        scores = np.empty(0, dtype=float)
    df['similarity'] = scores

    # Sort by similarity score (descending) and return the top_n most similar
    top_matches = df.sort_values(by='similarity', ascending=False).head(top_n)
    return top_matches[['combined', 'similarity']]


def get_openai_similarity(user_text, df, top_n=5, model='text-embedding-3-small'):
    """Return the ``top_n`` rows of ``df`` most similar to ``user_text``.

    This is the same operation as ``calculate_openai_similarity`` under a
    second name; it forwards all arguments to that function, so the parameters,
    the in-place ``similarity`` column, the return value and the raised errors
    are identical.
    """
    return calculate_openai_similarity(user_text, df, top_n=top_n, model=model)
|