Spaces:

georgeek
/

HF-LLM-Intent-Detection

Running

App Files Files Community

HF-LLM-Intent-Detection / src /E_openAI_embeddings.py

georgeek

Transfer

5ecde30 9 days ago

raw

history blame contribute delete

2.59 kB

	import openai
	import pandas as pd
	import numpy as np
	import faiss
	import os
	from dotenv import load_dotenv
	from sklearn.metrics.pairwise import cosine_similarity

	# Load variables from a local .env file into the process environment so the
	# OpenAI key does not have to be exported in the shell.
	load_dotenv()
	api_key = os.getenv('OPENAI_API_KEY')


	from openai import OpenAI

	# Hand the key to the client explicitly so it is clear which credential is
	# used. When the value is None the OpenAI client looks up OPENAI_API_KEY in
	# the environment itself, so this behaves the same as a bare OpenAI().
	client = OpenAI(api_key=api_key)

	def get_openai_embedding(text, model="text-embedding-3-small"):
	    """Return the OpenAI embedding for a single piece of text.

	    Newlines in ``text`` are replaced with spaces before the request is sent
	    through the module-level ``client``.

	    Args:
	        text: The string to embed.
	        model: Name of the OpenAI embedding model to request.

	    Returns:
	        The ``embedding`` attribute of the first item in the API response.
	    """
	    single_line = text.replace("\n", " ")
	    response = client.embeddings.create(input=[single_line], model=model)
	    first_item = response.data[0]
	    return first_item.embedding


	def save_openai_embeddings(csv_file, model='text-embedding-3-small'):
	    """Embed every utterance of a CSV file and write the result to disk.

	    Reads ``csv_file``, embeds each value of its ``utterance`` column with
	    ``get_openai_embedding`` and stores the vectors in a new
	    ``ada_embeddings`` column. The extended table is written to
	    ``embeddings/openai_embeddings.csv`` and a confirmation is printed.

	    Args:
	        csv_file: Path of the input CSV; it must have an ``utterance`` column.
	        model: Name of the OpenAI embedding model used for every row.

	    Returns:
	        None.
	    """
	    output_path = 'embeddings/openai_embeddings.csv'
	    df = pd.read_csv(csv_file)
	    # Create the target folder before any API call is made, so a missing
	    # directory cannot make the run fail after all embeddings were fetched.
	    os.makedirs(os.path.dirname(output_path), exist_ok=True)
	    # Forward the caller's model choice instead of a fixed model name.
	    df['ada_embeddings'] = df.utterance.apply(lambda x: get_openai_embedding(x, model=model))
	    df.to_csv(output_path, index=False)
	    print(f"Embeddings saved to {output_path}.")

	# Get and save the embeddings for the cleaned intent data

	#save_openai_embeddings(r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\Pager_Intents_cleaned.csv')


	def load_openai_embeddings(csv_file):
	    """Load the embeddings stored in the ``ada_embeddings`` column of a CSV.

	    A CSV cell cannot hold a list, so an embedding written with
	    ``DataFrame.to_csv`` is read back as the text of a Python list literal.
	    String cells are therefore parsed back into Python objects; cells of any
	    other type are returned as read.

	    Args:
	        csv_file: Path of a CSV file that has an ``ada_embeddings`` column.

	    Returns:
	        A list with one entry per row of the file.

	    Raises:
	        KeyError: If the file has no ``ada_embeddings`` column.
	        ValueError, SyntaxError: If a string cell is not a valid Python
	            literal.
	    """
	    import ast

	    df = pd.read_csv(csv_file)
	    raw_cells = df['ada_embeddings'].tolist()
	    # literal_eval only accepts plain literals, so it is safe on file content
	    # (unlike eval).
	    return [
	        ast.literal_eval(cell) if isinstance(cell, str) else cell
	        for cell in raw_cells
	    ]


	# Function to calculate similarity between user input and precomputed embeddings
	def calculate_openai_similarity(user_text, df, top_n=5):
	    """Rank the rows of ``df`` by cosine similarity to ``user_text``.

	    The user text is embedded with ``text-embedding-3-small`` and compared
	    with the embedding stored on each row. The scores are written to a
	    ``similarity`` column on ``df`` itself, so the caller's frame is modified.

	    Args:
	        user_text: Text to compare against the stored embeddings.
	        df: DataFrame with a ``combined`` column and an embedding column
	            named ``ada_embedding`` or ``ada_embeddings``. Each embedding may
	            be a sequence of floats or the string form of one, as produced by
	            a CSV round trip.
	        top_n: Number of best matches to return.

	    Returns:
	        DataFrame holding the ``combined`` and ``similarity`` columns of the
	        ``top_n`` highest-scoring rows, best match first.

	    Raises:
	        KeyError: If ``df`` has neither embedding column.
	    """
	    import ast

	    # save_openai_embeddings() writes the column as 'ada_embeddings', while
	    # this function has always read 'ada_embedding'; accept both spellings.
	    # Checked before the API call so a wrong frame does not cost a request.
	    embedding_column = next(
	        (name for name in ('ada_embedding', 'ada_embeddings') if name in df.columns),
	        None,
	    )
	    if embedding_column is None:
	        raise KeyError("df needs an 'ada_embedding' or 'ada_embeddings' column")

	    def _to_vector(value):
	        # read_csv() returns a stored list as text; turn it back into numbers.
	        if isinstance(value, str):
	            value = ast.literal_eval(value)
	        return np.asarray(value, dtype=float)

	    vectors = [_to_vector(value) for value in df[embedding_column]]

	    # Get embedding for the user input text as a single-row matrix.
	    user_embedding = np.array(get_openai_embedding(user_text, model='text-embedding-3-small')).reshape(1, -1)

	    if vectors:
	        # One call over the stacked matrix instead of one call per row.
	        scores = cosine_similarity(np.vstack(vectors), user_embedding)[:, 0]
	    else:
	        # np.vstack rejects an empty list; an empty frame gets no scores.
	        scores = np.empty(0, dtype=float)
	    df['similarity'] = scores

	    # Sort by similarity score (descending) and keep the top_n most similar.
	    top_matches = df.sort_values(by='similarity', ascending=False).head(top_n)
	    return top_matches[['combined', 'similarity']]

	def get_openai_similarity(user_text, df, top_n=5):
	    """Return the rows of ``df`` most similar to ``user_text``.

	    This is an alias of ``calculate_openai_similarity``: both names take the
	    same arguments and return the same result. Delegating keeps a single
	    implementation, so the two cannot drift apart.

	    Args:
	        user_text: Text to compare against the stored embeddings.
	        df: DataFrame holding the precomputed embeddings and a ``combined``
	            column; a ``similarity`` column is written to it.
	        top_n: Number of best matches to return.

	    Returns:
	        DataFrame with the ``combined`` and ``similarity`` columns of the
	        ``top_n`` highest-scoring rows.
	    """
	    return calculate_openai_similarity(user_text, df, top_n=top_n)