|
import os |
|
import pandas as pd |
|
from openai import OpenAI |
|
import streamlit as st |
|
from dotenv import load_dotenv |
|
import re |
|
|
|
# Populate the process environment via python-dotenv, then read the OpenAI
# key once at import time. The value is None when the variable is not set.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
|
|
|
|
|
def get_ballanced_intents(data, max_count=40):
    """Keep only the rows whose intent occurs at most ``max_count`` times.

    Args:
        data: pandas DataFrame with an ``intent`` column.
        max_count: Largest number of rows an intent may have and still be
            kept. Defaults to 40.

    Returns:
        The rows of ``data`` whose ``intent`` value appears no more than
        ``max_count`` times in ``data``.

    Side effects:
        Prints the shape of the filtered frame to stdout.
    """
    counts = data['intent'].value_counts()
    kept_intents = counts[counts <= max_count].index
    filtered_data = data[data['intent'].isin(kept_intents)]
    print(f"Filtered data shape: {filtered_data.shape}")
    return filtered_data
|
|
|
|
|
|
|
|
|
|
|
def create_prompt(user_text, utterances) -> str:
    """Build the similarity-rating prompt sent to the language model.

    Args:
        user_text: The text entered by the user.
        utterances: Iterable of example utterances; each one is listed on
            its own line, numbered from 1.

    Returns:
        A prompt containing the user text, the numbered utterance examples,
        and a request to rate each example's similarity from 0 to 1.
    """
    numbered_examples = "".join(
        f"{number}. {utterance}\n"
        for number, utterance in enumerate(utterances, start=1)
    )
    return (
        f"User text: {user_text}\n\nUtterance examples:\n"
        f"{numbered_examples}"
        "\nPlease rate the similarity of the user text to each of the utterance examples on a scale from 0 to 1."
    )
|
|
|
|
|
def _parse_similarity_scores(raw_text):
    """Turn the model's line-per-utterance reply into a list of score strings."""
    scores = []
    for line in raw_text.strip().split('\n'):
        # Keep only what follows the last "- " on the line; a line without
        # that separator is taken whole and will normally fail validation.
        candidate = line.split('- ')[-1].strip()
        print(candidate)
        # Accept any decimal in [0, 1] ("0", "0.85", "1", "1.0"); anything
        # else is treated as "no similarity".
        is_valid = re.fullmatch(r'0(?:\.\d+)?|1(?:\.0+)?', candidate)
        scores.append(candidate if is_valid else '0')
    return scores


def get_similarity_scores(prompt, temperature):
    """Ask the chat model to rate the prompt's utterances and parse the scores.

    Args:
        prompt: Prompt text sent as the user message.
        temperature: Sampling temperature forwarded to the chat completion.

    Returns:
        A list of score strings, one per line of the model's reply, where
        every entry that is not a decimal between 0 and 1 is replaced by
        ``'0'``. Returns ``None`` (after writing a message to the Streamlit
        page) when ``OPENAI_API_KEY`` is not set.

    Side effects:
        Prints each extracted candidate score to stdout.
    """
    load_dotenv()
    api_key = os.getenv('OPENAI_API_KEY')

    if not api_key:
        st.write("Please set the OPENAI_API_KEY environment variable.")
        return None

    # Authenticate with the key from the environment: credentials must never
    # be embedded in source code.
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Respond without details in ()"},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
        n=1,
        # NOTE(review): this stop sequence ends the reply before an eighth
        # numbered item, so at most seven utterances get scored - confirm
        # that limit is intended.
        stop='8.',
        temperature=temperature,
    )

    # The message content can be None; treat that as an empty reply.
    reply_text = response.choices[0].message.content or ''
    return _parse_similarity_scores(reply_text)
|
|
|
|
|
def get_most_similar_intent(user_text, utterances, intents, temperature=0.5):
    """Pick the intent whose example utterance the model rates most similar.

    Args:
        user_text: The text entered by the user.
        utterances: Example utterances to compare against, in the same
            order as ``intents``.
        intents: Intents indexed by position; ``intents[i]`` is returned
            when the i-th score is the highest.
        temperature: Sampling temperature passed to the scoring call.

    Returns:
        A ``(intent, score)`` tuple for the highest-rated utterance, where
        ``score`` is the score string produced by the scoring call. Returns
        ``(None, None)`` when no scores are available (for example when the
        API key is missing).

    Side effects:
        Prints the score list and the winning index to stdout.
    """
    prompt = create_prompt(user_text, utterances)
    similarity_scores = get_similarity_scores(prompt, temperature)
    print(similarity_scores)

    if not similarity_scores:
        # The scoring call returns None when it cannot run; there is
        # nothing to rank in that case.
        return None, None

    # Rank numerically rather than by string comparison; ties resolve to
    # the earliest position.
    max_index = max(
        range(len(similarity_scores)),
        key=lambda position: float(similarity_scores[position]),
    )
    print(f'max_index: {max_index}')
    most_similar_intent = intents[max_index]

    return most_similar_intent, similarity_scores[max_index]
|
|
|
|
|
|
|
|
|
|