|
import pandas as pd |
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
def filter_similar_sentences(
    model: SentenceTransformer,
    df: pd.DataFrame,
    threshold: float = 0.8,
) -> pd.DataFrame:
    """Drop near-duplicate utterances, keeping the first row of each similar group.

    All utterances are embedded with ``model`` and compared pairwise by cosine
    similarity. Rows are scanned in order: a row not yet marked as a duplicate
    is kept, and every later row whose similarity to it is at least
    ``threshold`` is marked as a duplicate and excluded from the result.

    Args:
        model: Encoder called as ``model.encode(sentences, convert_to_tensor=True)``.
        df: Frame with an ``utterance`` column holding the sentences.
        threshold: Minimum cosine similarity for two rows to count as duplicates.

    Returns:
        A new frame with the kept rows in their original order and a fresh
        0..n-1 index.
    """
    sentences = df['utterance'].tolist()
    if not sentences:
        # Nothing to compare; skip the encoder call entirely.
        return df.reset_index(drop=True)

    embeddings = model.encode(sentences, convert_to_tensor=True)
    # Convert the score matrix to nested Python floats once, so the pairwise
    # scan below does not index into the tensor element by element.
    scores = util.pytorch_cos_sim(embeddings, embeddings).tolist()

    kept = []
    duplicates = set()
    for i, row_scores in enumerate(scores):
        if i in duplicates:
            # Already covered by an earlier, similar row.
            continue
        kept.append(i)
        for j in range(i + 1, len(sentences)):
            score = row_scores[j]
            if score >= threshold:
                print(f"Similarity between '{sentences[i]}' and '{sentences[j]}' is {score:.2f}")
                duplicates.add(j)

    return df.iloc[kept].reset_index(drop=True)
|
|
|
|
|
|
|
def get_similar_sentences(model: SentenceTransformer, user_text: str, df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the ``top_n`` rows of ``df`` whose utterance is closest to ``user_text``.

    ``user_text`` and every utterance are embedded with ``model`` and ranked by
    cosine similarity, highest first. The score of each returned row is stored
    in a ``similarity`` column of the result; ``df`` itself is not modified.

    Args:
        model: Encoder called as ``model.encode(..., convert_to_tensor=True)``.
        user_text: Query sentence to compare against the utterances.
        df: Frame with ``utterance``, ``intent`` and ``combined`` columns.
        top_n: Number of best matches to return.

    Returns:
        A frame with columns ``utterance``, ``intent``, ``combined`` and
        ``similarity``, ordered from most to least similar.
    """
    columns = ['utterance', 'intent', 'combined', 'similarity']
    sentences = df['utterance'].tolist()
    if not sentences:
        # No candidates: return an empty frame with the expected columns
        # without calling the encoder.
        return df.iloc[:0].assign(similarity=pd.Series(dtype=float))[columns]

    user_embedding = model.encode(user_text, convert_to_tensor=True)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(user_embedding, embeddings)[0]

    # Plain Python ints/floats keep the pandas indexing and column assignment
    # independent of the tensor type.
    top_matches = cosine_scores.argsort(descending=True)[:top_n].tolist()
    all_scores = cosine_scores.tolist()

    result = df.iloc[top_matches].copy()
    result['similarity'] = [all_scores[idx] for idx in top_matches]
    return result[columns]
|
|
|
|
|
# Location of the CSV that holds the utterance data.
file_path = r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\InvoiceDetailsExplanation.csv'

# Load the CSV into a DataFrame.
utterances = pd.read_csv(file_path)

# Sentence-embedding model used for the similarity computations.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Sample query strings.
examples = [
    'vreau detalii despre ultima factura',
    'cat pot sa platesc',
    'informatii factura',
    'vreau informatii despre costuri',
    'as dori sa aflu ultima factura',
]
|
|
|
def filter_similar_sentences(
    model: SentenceTransformer,
    df: pd.DataFrame,
    threshold: float = 0.8,
) -> pd.DataFrame:
    """Drop near-duplicate utterances, keeping the first row of each similar group.

    All utterances are embedded with ``model`` and compared pairwise by cosine
    similarity. Rows are scanned in order: a row not yet marked as a duplicate
    is kept, and every later row whose similarity to it is at least
    ``threshold`` is marked as a duplicate and excluded from the result.

    Args:
        model: Encoder called as ``model.encode(sentences, convert_to_tensor=True)``.
        df: Frame with an ``utterance`` column holding the sentences.
        threshold: Minimum cosine similarity for two rows to count as duplicates.

    Returns:
        A new frame with the kept rows in their original order and a fresh
        0..n-1 index.
    """
    sentences = df['utterance'].tolist()
    if not sentences:
        # Nothing to compare; skip the encoder call entirely.
        return df.reset_index(drop=True)

    embeddings = model.encode(sentences, convert_to_tensor=True)
    # Convert the score matrix to nested Python floats once, so the pairwise
    # scan below does not index into the tensor element by element.
    scores = util.pytorch_cos_sim(embeddings, embeddings).tolist()

    kept = []
    duplicates = set()
    for i, row_scores in enumerate(scores):
        if i in duplicates:
            # Already covered by an earlier, similar row.
            continue
        kept.append(i)
        for j in range(i + 1, len(sentences)):
            score = row_scores[j]
            if score >= threshold:
                print(f"Similarity between '{sentences[i]}' and '{sentences[j]}' is {score:.2f}")
                duplicates.add(j)

    return df.iloc[kept].reset_index(drop=True)
|
|
|
def get_similar_sentences(model: SentenceTransformer, user_text: str, df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the ``top_n`` rows of ``df`` whose utterance is closest to ``user_text``.

    ``user_text`` and every utterance are embedded with ``model`` and ranked by
    cosine similarity, highest first. The score of each returned row is stored
    in a ``similarity`` column of the result; ``df`` itself is not modified.

    Args:
        model: Encoder called as ``model.encode(..., convert_to_tensor=True)``.
        user_text: Query sentence to compare against the utterances.
        df: Frame with ``utterance``, ``intent`` and ``combined`` columns.
        top_n: Number of best matches to return.

    Returns:
        A frame with columns ``utterance``, ``intent``, ``combined`` and
        ``similarity``, ordered from most to least similar.
    """
    columns = ['utterance', 'intent', 'combined', 'similarity']
    sentences = df['utterance'].tolist()
    if not sentences:
        # No candidates: return an empty frame with the expected columns
        # without calling the encoder.
        return df.iloc[:0].assign(similarity=pd.Series(dtype=float))[columns]

    user_embedding = model.encode(user_text, convert_to_tensor=True)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(user_embedding, embeddings)[0]

    # Plain Python ints/floats keep the pandas indexing and column assignment
    # independent of the tensor type.
    top_matches = cosine_scores.argsort(descending=True)[:top_n].tolist()
    all_scores = cosine_scores.tolist()

    result = df.iloc[top_matches].copy()
    result['similarity'] = [all_scores[idx] for idx in top_matches]
    return result[columns]
|
|
|
# Location of the CSV that holds the utterance data.
file_path = r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\InvoiceDetailsExplanation.csv'

# Load the CSV into a DataFrame.
utterances = pd.read_csv(file_path)

# Sentence-embedding model used for the similarity computations.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Sample inputs to run through the similarity search.
examples = [
    'vreau detalii despre ultima factura',
    'cat pot sa platesc',
    'informatii factura',
    'vreau informatii despre costuri',
    'as dori sa aflu ultima factura',
]

for example in examples:
    print(f"Input: {example}")
    # Look up the closest utterances for this input before printing them.
    similar_sentences = get_similar_sentences(model, example, utterances)
    print(similar_sentences)
    print("\n")