File size: 2,447 Bytes
5ecde30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import openai
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
from dotenv import load_dotenv


# Configure the OpenAI SDK from the environment.
# load_dotenv() copies variables from a local .env file (if one exists) into
# the process environment; the key is then read from OPENAI_API_KEY.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
# `openai` is imported before load_dotenv() runs, so an SDK that captures
# OPENAI_API_KEY at import time would never see a key that lives only in .env.
# Passing the key to the SDK explicitly removes that ordering dependency.
openai.api_key = api_key

def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for *text* from the OpenAI embeddings endpoint.

    Newlines in *text* are replaced by single spaces before the request is
    sent.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``
        list (presumably a list of floats -- confirm against the SDK version
        in use).
    """
    # Splitting on "\n" and re-joining with spaces flattens the text to one line.
    single_line = " ".join(text.split("\n"))
    result = openai.Embedding.create(model=model, input=[single_line])
    first_item = result['data'][0]
    return first_item['embedding']

def calculate_similarity(user_text, df, top_n=3, model="text-embedding-3-small"):
    """Print and return the *top_n* utterances most similar to *user_text*.

    The DataFrame is not modified.

    Args:
        user_text: The query string to embed and compare.
        df: DataFrame with an ``utterance`` column and an ``ada_embeddings``
            column holding one embedding vector per row.
        top_n: Maximum number of matches to return.
        model: Embedding model used for the query. It should be the same
            model that produced ``df['ada_embeddings']``; the default matches
            the model used by the other embedding helpers in this file.
            Scores computed between vectors of different models are not
            expected to be meaningful.

    Returns:
        A list of ``(utterance, similarity_score)`` tuples sorted by score in
        descending order, at most *top_n* long. Empty when *df* has no rows.
    """
    if len(df) == 0:
        # Nothing to compare against: skip the (billable) embedding request.
        top_matches = []
    else:
        query_vector = np.array(get_embedding(user_text, model=model)).reshape(1, -1)

        # Stack the stored vectors into one (n_rows, dim) matrix so that all
        # similarities are computed in a single call instead of once per row.
        corpus_matrix = np.vstack(df['ada_embeddings'].tolist())
        scores = cosine_similarity(corpus_matrix, query_vector).ravel()

        # sorted() is stable, so rows with equal scores keep their DataFrame order.
        top_matches = sorted(
            zip(df['utterance'], scores),
            key=lambda pair: pair[1],
            reverse=True,
        )[:top_n]

    # Print the results
    print("Top similar sentences:")
    for sentence, score in top_matches:
        print(f"Sentence: {sentence}, Similarity: {score:.4f}")

    return top_matches



def save_openai_embeddings(csv_file, model='text-embedding-3-small',
                           output_file='embeddings/openai_embeddings.csv'):
    """Embed every utterance in *csv_file* and write the result to a CSV.

    Args:
        csv_file: Path to a CSV file that has an ``utterance`` column.
        model: OpenAI embedding model passed to ``get_embedding`` for every
            row.
        output_file: Destination CSV path. Its parent directory is created
            when missing.

    Returns:
        None. The input columns plus an ``ada_embeddings`` column are written
        to *output_file* (without the index) and a confirmation is printed.
    """
    df = pd.read_csv(csv_file)

    # One embedding request per row, using the caller-selected model.
    # The column keeps the name 'ada_embeddings' because the loading code in
    # this file reads it under that name.
    df['ada_embeddings'] = df.utterance.apply(lambda x: get_embedding(x, model=model))

    # Create the target directory first so the embeddings just computed are
    # not lost to a missing-folder error on write.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_file, index=False)
    print(f"Embeddings saved to {output_file}.")

# One-time routine (run once to generate the embeddings CSV): save_openai_embeddings('data/Pager_Intents_cleaned.csv')


# Load precomputed embeddings from CSV
# NOTE(review): this is a machine-specific absolute path; confirm it exists
# on the machine running the script.
df = pd.read_csv(r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\embeddings\openai_embeddings.csv')

# Preview the first rows. `head` has to be called: printing the bare
# attribute shows the bound-method repr instead of the preview.
print(df.head())

# The embeddings column is read back from CSV as text, so each cell is parsed
# into a Python object and then converted to a NumPy array.
# NOTE(review): eval() executes whatever expression the cell contains. This is
# only acceptable while the CSV is a trusted, locally generated file; consider
# ast.literal_eval if the file can come from anywhere else.
df['ada_embeddings'] = df['ada_embeddings'].apply(eval).apply(np.array)

# Test user input
user_input = "Cat am de plata la ultima factura?"

# Calculate and print top 3 similar sentences
top_similar_reviews = calculate_similarity(user_input, df, top_n=3)