"""Find utterances that are semantically similar to a user query.

The script loads a CSV of utterances, embeds them with a
sentence-transformers model, and prints the closest matches for a handful
of example queries.
"""

import pandas as pd
from sentence_transformers import SentenceTransformer, util


def filter_similar_sentences(
    model: SentenceTransformer,
    df: pd.DataFrame,
    *,
    threshold: float = 0.8,
) -> pd.DataFrame:
    """Drop utterances that are near-duplicates of an earlier utterance.

    Rows are visited in order. The first row of every group of similar
    utterances is kept; any later row whose cosine similarity to a kept row
    is at least ``threshold`` is dropped. Each detected pair is printed.

    Args:
        model: Model used to embed the ``'utterance'`` column.
        df: Frame with an ``'utterance'`` column.
        threshold: Minimum cosine similarity at which two utterances are
            treated as duplicates.

    Returns:
        A new frame holding the kept rows in their original order, with a
        fresh 0..n-1 index.
    """
    if df.empty:
        # Nothing to compare; avoid asking the model to encode an empty list.
        return df.reset_index(drop=True)

    sentences = df['utterance'].tolist()
    embeddings = model.encode(sentences, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

    kept = []        # positions to keep, in original order
    dropped = set()  # positions already identified as duplicates
    for i, sentence in enumerate(sentences):
        if i in dropped:
            continue
        kept.append(i)
        # Pull the row out once as plain floats instead of indexing the
        # tensor element by element.
        row = cosine_scores[i].tolist()
        for j in range(i + 1, len(sentences)):
            if j in dropped:
                continue
            if row[j] >= threshold:
                print(f"Similarity between '{sentence}' and '{sentences[j]}' is {row[j]:.2f}")
                dropped.add(j)

    return df.iloc[kept].reset_index(drop=True)


def get_similar_sentences(
    model: SentenceTransformer,
    user_text: str,
    df: pd.DataFrame,
    top_n: int = 5,
) -> pd.DataFrame:
    """Return the ``top_n`` utterances most similar to ``user_text``.

    Args:
        model: Model used to embed both the query and the utterances.
        user_text: The query text.
        df: Frame with ``'utterance'``, ``'intent'`` and ``'combined'``
            columns (the latter two are only selected, never computed here).
        top_n: Maximum number of rows to return.

    Returns:
        A frame with columns ``utterance``, ``intent``, ``combined`` and
        ``similarity`` (cosine similarity to the query), ordered from most
        to least similar.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
    """
    columns = ['utterance', 'intent', 'combined', 'similarity']

    if df.empty:
        # Keep the output schema stable even when there is nothing to rank.
        empty = df.copy()
        empty['similarity'] = pd.Series(dtype='float64')
        return empty[columns]

    user_embedding = model.encode(user_text, convert_to_tensor=True)
    embeddings = model.encode(df['utterance'].tolist(), convert_to_tensor=True)

    # Row 0 holds the similarity of the single query against every utterance.
    cosine_scores = util.pytorch_cos_sim(user_embedding, embeddings)[0]

    # Convert to plain Python values so pandas receives ordinary integer
    # positions and floats rather than tensors.
    top_matches = cosine_scores.argsort(descending=True)[:top_n].tolist()
    scores = cosine_scores.tolist()

    result = df.iloc[top_matches].copy()
    result['similarity'] = [scores[i] for i in top_matches]
    return result[columns]


file_path = r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\InvoiceDetailsExplanation.csv'

# Load the data
utterances = pd.read_csv(file_path)

# Load the embedding model.
# NOTE(review): an earlier comment mentioned multilingual-e5-small, and the
# example queries below are not English; confirm that all-MiniLM-L6-v2 is the
# intended model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Optional de-duplication step, currently disabled:
# filtered_utterances = filter_similar_sentences(model, utterances)
# filtered_utterances.head()

examples = [
    'vreau detalii despre ultima factura',
    'cat pot sa platesc',
    'informatii factura',
    'vreau informatii despre costuri',
    'as dori sa aflu ultima factura',
]

for example in examples:
    similar_sentences = get_similar_sentences(model, example, utterances)
    print(f"Input: {example}")
    print(similar_sentences)
    print("\n")