Spaces:

georgeek
/

HF-LLM-Intent-Detection

Running

App Files Files Community

HF-LLM-Intent-Detection / src /Z_openAI_embeddings_test.py

georgeek

Transfer

5ecde30 9 days ago

raw

history blame contribute delete

2.45 kB

	import openai
	import pandas as pd
	import numpy as np
	from sklearn.metrics.pairwise import cosine_similarity
	import os
	from dotenv import load_dotenv


	# Configure the OpenAI client.
	# load_dotenv() copies the variables from a local .env file into the process
	# environment so OPENAI_API_KEY can be read below.
	load_dotenv()
	api_key = os.getenv('OPENAI_API_KEY')

	# Hand the key to the library explicitly: the pre-1.0 openai package reads
	# OPENAI_API_KEY when it is imported, which happens before load_dotenv() runs,
	# so a key that only lives in .env would otherwise never reach the client.
	openai.api_key = api_key

	# Fetch the OpenAI embedding for one piece of text
	def get_embedding(text, model="text-embedding-3-small"):
	    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

	    Newline characters in ``text`` are replaced with spaces before the
	    request is sent.

	    Args:
	        text: The string to embed.
	        model: Name of the OpenAI embedding model to query.

	    Returns:
	        The ``embedding`` entry of the first item in the response's ``data``
	        list (presumably a list of floats -- confirm against the API docs).
	    """
	    single_line = text.replace("\n", " ")
	    result = openai.Embedding.create(input=[single_line], model=model)
	    first_item = result['data'][0]
	    return first_item['embedding']

	# Rank the utterances in df by cosine similarity to the user's text (df is not modified)
	def calculate_similarity(user_text, df, top_n=3, model='text-embedding-ada-002'):
	    """Print and return the ``top_n`` utterances most similar to ``user_text``.

	    Args:
	        user_text: The query sentence to embed and compare.
	        df: DataFrame with an ``utterance`` column and an ``ada_embeddings``
	            column holding one embedding vector per row. It is only read.
	        top_n: Maximum number of matches to return.
	        model: Embedding model used for the query. It must be the model that
	            produced ``df['ada_embeddings']``; vectors from different models
	            are not comparable. The default is kept for backward
	            compatibility, but note that save_openai_embeddings() writes
	            'text-embedding-3-small' vectors by default.

	    Returns:
	        A list of ``(utterance, similarity_score)`` tuples sorted by score in
	        descending order, at most ``top_n`` long.
	    """
	    # Embed the query and shape it as a single-row matrix for cosine_similarity
	    user_embedding = np.array(get_embedding(user_text, model=model)).reshape(1, -1)

	    if df.empty:
	        # Nothing to compare against; np.vstack would fail on an empty list
	        similarities = []
	    else:
	        # Stack the stored vectors into one (rows, dim) matrix and score every
	        # row in a single call instead of one sklearn call per row
	        stored_embeddings = np.vstack(df['ada_embeddings'].tolist())
	        scores = cosine_similarity(stored_embeddings, user_embedding)[:, 0]
	        similarities = list(zip(df['utterance'], scores))

	    # Highest score first; sorted() is stable, so ties keep DataFrame order
	    top_matches = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:top_n]

	    # Print the results
	    print("Top similar sentences:")
	    for sentence, score in top_matches:
	        print(f"Sentence: {sentence}, Similarity: {score:.4f}")

	    return top_matches



	def save_openai_embeddings(csv_file, model='text-embedding-3-small'):
	    """Embed every utterance in ``csv_file`` and save the result as a CSV.

	    Reads ``csv_file``, requests one embedding per value of its ``utterance``
	    column via get_embedding(), stores the vectors in a new
	    ``ada_embeddings`` column and writes the table to
	    ``embeddings/openai_embeddings.csv`` (relative to the working directory).

	    Args:
	        csv_file: Path of the input CSV; it must contain an ``utterance`` column.
	        model: Name of the OpenAI embedding model to use for every row.
	    """
	    output_path = 'embeddings/openai_embeddings.csv'

	    # Load the CSV file
	    df = pd.read_csv(csv_file)

	    # Make sure the target folder exists before any embeddings are requested,
	    # so a missing directory cannot discard the computed results at write time
	    os.makedirs(os.path.dirname(output_path), exist_ok=True)

	    # Honour the caller's model choice (it was previously ignored in favour of
	    # a hard-coded name)
	    df['ada_embeddings'] = df.utterance.apply(lambda x: get_embedding(x, model=model))

	    # Save the embeddings
	    df.to_csv(output_path, index=False)
	    print(f"Embeddings saved to {output_path}.")

	# One-time routine (run once to generate the embeddings file): save_openai_embeddings('data/Pager_Intents_cleaned.csv')


	# Load precomputed embeddings from CSV.
	# Uses the same relative location that save_openai_embeddings() writes to,
	# rather than an absolute path that only exists on one developer's machine.
	df = pd.read_csv('embeddings/openai_embeddings.csv')

	# Preview the first rows; head must be called, otherwise the bound method
	# itself is printed
	print(df.head())

	# The embeddings were written to the CSV as text, so parse each cell back
	# into a Python object and wrap it in a NumPy array.
	# NOTE(security): eval() executes whatever the cell contains; only load files
	# produced by this project. ast.literal_eval would be a safer parser.
	df['ada_embeddings'] = df['ada_embeddings'].apply(eval).apply(np.array)

	# Test user input (Romanian: "How much do I have to pay on the last invoice?")
	user_input = "Cat am de plata la ultima factura?"

	# Calculate and print top 3 similar sentences
	top_similar_reviews = calculate_similarity(user_input, df, top_n=3)