|
import ast
import os

import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
|
# Load variables from a local .env file into the process environment.
load_dotenv()

api_key = os.getenv('OPENAI_API_KEY')

# `import openai` ran before load_dotenv(); the legacy (pre-1.0) SDK captures
# OPENAI_API_KEY at import time, so a key that lives only in .env would not
# be picked up. Hand it over explicitly. Skipped when the variable is unset
# so that any configuration already present on the SDK is left untouched.
if api_key:
    openai.api_key = api_key
|
|
|
|
|
def get_embedding(text, model="text-embedding-3-small"):
    """Fetch the embedding vector for a piece of text from OpenAI.

    Args:
        text: String to embed. Newline characters are replaced by single
            spaces before the request is sent.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` entry of the first item in the response's
        ``data`` list.
    """
    # Same effect as replacing every "\n" with " ".
    single_line = " ".join(text.split("\n"))
    api_result = openai.Embedding.create(model=model, input=[single_line])
    first_item = api_result['data'][0]
    return first_item['embedding']
|
|
|
|
|
def calculate_similarity(user_text, df, top_n=3, model='text-embedding-ada-002'):
    """Rank the utterances in *df* by cosine similarity to *user_text*.

    Args:
        user_text: Query text; it is embedded with ``get_embedding``.
        df: DataFrame with an ``utterance`` column and an ``ada_embeddings``
            column holding one numeric vector per row.
        top_n: Maximum number of matches to return.
        model: Embedding model used for the query. It should be the same
            model that produced ``df['ada_embeddings']``; similarity scores
            between vectors from different models are not meaningful. The
            default is the value this function previously hard-coded.

    Returns:
        A list of ``(utterance, score)`` tuples, highest score first, with
        at most ``top_n`` entries. The same matches are printed to stdout.
    """
    if df.empty:
        # Nothing to compare against, so skip the embedding request.
        top_matches = []
    else:
        query_vector = np.array(get_embedding(user_text, model=model)).reshape(1, -1)

        # Stack the stored vectors and score them in one batched call
        # instead of one cosine_similarity call per row.
        stored_vectors = np.vstack(df['ada_embeddings'].tolist())
        scores = cosine_similarity(stored_vectors, query_vector)[:, 0]

        # sorted() is stable, so equal scores keep their DataFrame order.
        top_matches = sorted(
            zip(df['utterance'], scores),
            key=lambda pair: pair[1],
            reverse=True,
        )[:top_n]

    print("Top similar sentences:")
    for sentence, score in top_matches:
        print(f"Sentence: {sentence}, Similarity: {score:.4f}")

    return top_matches
|
|
|
|
|
|
|
def save_openai_embeddings(csv_file, model='text-embedding-3-small',
                           output_path='embeddings/openai_embeddings.csv'):
    """Embed every utterance in a CSV file and write the result to disk.

    Args:
        csv_file: Path of the input CSV; it must contain an ``utterance``
            column.
        model: Embedding model passed to ``get_embedding`` for every row.
        output_path: Destination CSV. Its parent directory is created when
            it does not exist yet.

    Returns:
        None. The input rows plus an ``ada_embeddings`` column are written
        to ``output_path`` and a confirmation line is printed.
    """
    df = pd.read_csv(csv_file)

    # Use the caller's model rather than a fixed name, so the argument
    # actually controls which embeddings are generated.
    df['ada_embeddings'] = df['utterance'].apply(
        lambda text: get_embedding(text, model=model)
    )

    # to_csv does not create missing directories.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"Embeddings saved to {output_path}.")
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): machine-specific absolute path. save_openai_embeddings writes
# to the relative 'embeddings/openai_embeddings.csv' by default; confirm both
# refer to the same file.
df = pd.read_csv(r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\embeddings\openai_embeddings.csv')

# Call head() so the first rows are shown rather than the bound method.
print(df.head())

# The CSV holds each embedding as text. Parse it as a Python literal, which
# cannot execute arbitrary code the way eval() can, then convert to ndarray.
df['ada_embeddings'] = df['ada_embeddings'].apply(ast.literal_eval).apply(np.array)

user_input = "Cat am de plata la ultima factura?"

top_similar_reviews = calculate_similarity(user_input, df, top_n=3)
|
|