Spaces:
Sleeping
Sleeping
| import openai | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import os | |
| from dotenv import load_dotenv | |
# Load environment variables (including OPENAI_API_KEY) from a local .env file.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
# `import openai` runs before load_dotenv(), so a key that lives only in .env
# is not in the environment when the library is imported. Hand the key to the
# module explicitly so the module-level Embedding calls below can authenticate.
openai.api_key = api_key
# Fetch the OpenAI embedding vector for a single piece of text.
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the given OpenAI model.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    The text is submitted as a one-element batch, and the ``embedding`` field
    of the first entry in the response's ``data`` list is returned.
    """
    flattened = " ".join(text.split("\n"))
    reply = openai.Embedding.create(model=model, input=[flattened])
    first_item = reply['data'][0]
    return first_item['embedding']
# Rank the stored utterances by cosine similarity to a user query (df is not modified).
def calculate_similarity(user_text, df, top_n=3, model='text-embedding-ada-002'):
    """Print and return the ``top_n`` utterances most similar to ``user_text``.

    Args:
        user_text: Query text; it is embedded with ``get_embedding``.
        df: DataFrame with an ``utterance`` column and an ``ada_embeddings``
            column holding one embedding vector per row. It is only read.
        top_n: Number of best matches to keep.
        model: Embedding model used for the query.
            NOTE(review): cosine scores are only meaningful when this is the
            same model that produced ``df['ada_embeddings']``. The default is
            kept for backward compatibility, but it differs from the default
            model of ``save_openai_embeddings`` -- confirm which model the
            stored embeddings were built with.

    Returns:
        A list of ``(utterance, score)`` tuples sorted by descending score.
        The list is empty when ``df`` has no rows.
    """
    # Shape (1, dim) so it can be passed as the second matrix to cosine_similarity.
    user_embedding = np.array(get_embedding(user_text, model=model)).reshape(1, -1)

    if len(df) == 0:
        # np.vstack rejects an empty sequence; there is nothing to rank anyway.
        scored = []
    else:
        # One (n_rows, dim) matrix -> a single vectorised similarity call
        # instead of one cosine_similarity call per row.
        embedding_matrix = np.vstack(df['ada_embeddings'].tolist())
        scores = cosine_similarity(embedding_matrix, user_embedding).ravel()
        scored = list(zip(df['utterance'], scores))

    # sorted() is stable, so ties keep their original DataFrame order.
    top_matches = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]

    print("Top similar sentences:")
    for sentence, score in top_matches:
        print(f"Sentence: {sentence}, Similarity: {score:.4f}")
    return top_matches
def save_openai_embeddings(csv_file, model='text-embedding-3-small',
                           output_path='embeddings/openai_embeddings.csv'):
    """Embed every utterance of a CSV file and write the result to a new CSV.

    Args:
        csv_file: Path of the input CSV; it must contain an ``utterance`` column.
        model: Embedding model passed to ``get_embedding`` for every row.
        output_path: Destination CSV. The input columns are kept and an
            ``ada_embeddings`` column with the embedding of each utterance
            is added.
    """
    df = pd.read_csv(csv_file)

    # Honour the caller's `model` instead of a hard-coded model name.
    df['ada_embeddings'] = df['utterance'].apply(
        lambda text: get_embedding(text, model=model)
    )

    # Create the target directory on first use so to_csv does not fail.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"Embeddings saved to {output_path}.")
# One-time routine to (re)build the embeddings file:
# save_openai_embeddings('data/Pager_Intents_cleaned.csv')

# Load the precomputed embeddings from the same relative location that
# save_openai_embeddings writes to by default, rather than from an absolute
# path that exists on a single developer machine.
df = pd.read_csv('embeddings/openai_embeddings.csv')
print(df.head())  # call head(); printing the bound method dumps its repr instead

# Each stored embedding is parsed back from its text form and turned into a
# NumPy array.
# NOTE(review): eval() executes whatever expression the CSV cell contains.
# This is acceptable only while the file is produced locally by
# save_openai_embeddings; switch to ast.literal_eval if the CSV can ever come
# from an untrusted source.
df['ada_embeddings'] = df['ada_embeddings'].apply(eval).apply(np.array)

# Test user input (Romanian: "How much do I have to pay on the last invoice?")
user_input = "Cat am de plata la ultima factura?"

# Calculate and print the top 3 most similar sentences
top_similar_reviews = calculate_similarity(user_input, df, top_n=3)