Update evaluation.py
evaluation.py  CHANGED  (+148 -131)
@@ -1,131 +1,148 @@
-
-import numpy as np
-from sklearn.metrics import mean_squared_error, roc_auc_score
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-
-from data_processing import load_query_dataset
-
-global ground_truth_answer, ground_truth_metrics
-
-ground_truth_answer = ''
-ground_truth_metrics = {}
-
-# def calculate_metrics(question, response, docs, time_taken):
-#     data = load_ragbench()
-#     retrieve_ground_truths(question, data)
-#     # Predicted metrics
-#     predicted_metrics = {
-#         "ground_truth": ground_truth_answer,
-#         "context_relevance": context_relevance(question, docs),
-#         "context_utilization": context_utilization(response, docs),
-#         "completeness": completeness(response, ground_truth_answer),
-#         "adherence": adherence(response, docs),
-#         "response_time" : time_taken
-#     }
-#     return predicted_metrics
-
-# def retrieve_ground_truths(question,ragbench_set):
-#     for dataset_name in ragbench_set.keys():
-#         for split_name,instances in ragbench_set[dataset_name].items():  # Fixed: Removed extra '.' and corrected indentation
-#             print(f"Processing {split_name} split")
-#             for instance in instances:  # Fixed: Corrected indentation
-#                 # Check if the question (data) matches the query
-#                 if instance['question'] == question:
-#                     # If a match is found, retrieve id and response
-#                     instance_id = instance['id']
-#                     instance_response = instance['response']
-#                     ground_truth_metrics = {
-#                         "context_relevance": instance['relevance_score'],
-#                         "context_utilization": instance['utilization_score'],
-#                         "completeness": instance['completeness_score'],
-#                         "adherence": instance['adherence_score']
-#                     }
-#                     ground_truth_answer = instance_response
-#                     print(f"Match found in {split_name} split!")
-#                     print(f"ID: {instance_id}, Response: {instance_response}")
-#                     break  # Exit after finding the first match (optional)
-
-# Step 1: Helper function to compute cosine similarity
-def compute_cosine_similarity(text1, text2):
-    if not text1 or not text2:  # Check for empty or None values
-        print("Error: One or both input texts are empty. Returning similarity as 0.")
-        return 0.0
-
-    vectorizer = TfidfVectorizer(stop_words="english")
-
-    try:
-        vectors = vectorizer.fit_transform([text1, text2])
-        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
-        return similarity
-    except ValueError as e:
-        print(f"Error in vectorization: {e}. Returning similarity as 0.")
-        return 0.0
-
-# Step 2: Metric 1 - Context Relevance
-def context_relevance(question, relevant_documents):
-    # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
-    combined_docs = " ".join([doc for doc in relevant_documents])
-    return compute_cosine_similarity(question, combined_docs)
-
-# Step 3: Metric 2 - Context Utilization
-def context_utilization(response, relevant_documents):
-    # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
-    combined_docs = " ".join([doc for doc in relevant_documents])
-    return compute_cosine_similarity(response, combined_docs)
-
-# Step 4: Metric 3 - Completeness
-def completeness(response, ground_truth_answer):
-    return compute_cosine_similarity(response, ground_truth_answer)
-
-# Step 5: Metric 4 - Adherence
-def adherence(response, relevant_documents):
-    # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
-    combined_docs = " ".join([doc for doc in relevant_documents])
-    response_tokens = set(response.split())
-    relevant_tokens = set(combined_docs.split())
-    supported_tokens = response_tokens.intersection(relevant_tokens)
-    return len(supported_tokens) / len(response_tokens)
-
-# Step 6: Compute RMSE for metrics
-def compute_rmse(predicted_values, ground_truth_values):
-    return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))
-
-def
+
+import numpy as np
+from sklearn.metrics import mean_squared_error, roc_auc_score
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+from data_processing import load_query_dataset
+
+global ground_truth_answer, ground_truth_metrics
+
+ground_truth_answer = ''
+ground_truth_metrics = {}
+
+# def calculate_metrics(question, response, docs, time_taken):
+#     data = load_ragbench()
+#     retrieve_ground_truths(question, data)
+#     # Predicted metrics
+#     predicted_metrics = {
+#         "ground_truth": ground_truth_answer,
+#         "context_relevance": context_relevance(question, docs),
+#         "context_utilization": context_utilization(response, docs),
+#         "completeness": completeness(response, ground_truth_answer),
+#         "adherence": adherence(response, docs),
+#         "response_time" : time_taken
+#     }
+#     return predicted_metrics
+
+# def retrieve_ground_truths(question,ragbench_set):
+#     for dataset_name in ragbench_set.keys():
+#         for split_name,instances in ragbench_set[dataset_name].items():  # Fixed: Removed extra '.' and corrected indentation
+#             print(f"Processing {split_name} split")
+#             for instance in instances:  # Fixed: Corrected indentation
+#                 # Check if the question (data) matches the query
+#                 if instance['question'] == question:
+#                     # If a match is found, retrieve id and response
+#                     instance_id = instance['id']
+#                     instance_response = instance['response']
+#                     ground_truth_metrics = {
+#                         "context_relevance": instance['relevance_score'],
+#                         "context_utilization": instance['utilization_score'],
+#                         "completeness": instance['completeness_score'],
+#                         "adherence": instance['adherence_score']
+#                     }
+#                     ground_truth_answer = instance_response
+#                     print(f"Match found in {split_name} split!")
+#                     print(f"ID: {instance_id}, Response: {instance_response}")
+#                     break  # Exit after finding the first match (optional)
+
+# Step 1: Helper function to compute cosine similarity
+def compute_cosine_similarity(text1, text2):
+    if not text1 or not text2:  # Check for empty or None values
+        print("Error: One or both input texts are empty. Returning similarity as 0.")
+        return 0.0
+
+    vectorizer = TfidfVectorizer(stop_words="english")
+
+    try:
+        vectors = vectorizer.fit_transform([text1, text2])
+        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
+        return similarity
+    except ValueError as e:
+        print(f"Error in vectorization: {e}. Returning similarity as 0.")
+        return 0.0
+
+# Step 2: Metric 1 - Context Relevance
+def context_relevance(question, relevant_documents):
+    # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+    combined_docs = " ".join([doc for doc in relevant_documents])
+    return compute_cosine_similarity(question, combined_docs)
+
+# Step 3: Metric 2 - Context Utilization
+def context_utilization(response, relevant_documents):
+    # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+    combined_docs = " ".join([doc for doc in relevant_documents])
+    return compute_cosine_similarity(response, combined_docs)
+
+# Step 4: Metric 3 - Completeness
+def completeness(response, ground_truth_answer):
+    return compute_cosine_similarity(response, ground_truth_answer)
+
+# Step 5: Metric 4 - Adherence
+def adherence(response, relevant_documents):
+    # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+    combined_docs = " ".join([doc for doc in relevant_documents])
+    response_tokens = set(response.split())
+    relevant_tokens = set(combined_docs.split())
+    supported_tokens = response_tokens.intersection(relevant_tokens)
+    return len(supported_tokens) / len(response_tokens)
+
+# Step 6: Compute RMSE for metrics
+def compute_rmse(predicted_values, ground_truth_values):
+    return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))
+
+def retrieve_ground_truths(question, dataset):
+    """Retrieve the ground truth answer for a given question from the dataset."""
+    for split_name, instances in dataset.items():
+        for instance in instances:
+            if instance['question'] == question:
+                return instance['response']  # Return the ground truth response immediately
+    return None  # Return None if no match is found
+
+
+
+
+def calculate_metrics(question, q_dataset, response, docs, time_taken):
+    data = load_query_dataset(q_dataset)
+    ground_truth_answer = retrieve_ground_truths(question, data)  # Store the ground truth answer
+
+    # Ensure ground_truth_answer is not empty before proceeding
+    if ground_truth_answer is None:
+        ground_truth_answer = ""  # Default to an empty string if no ground truth is found
+
+    # Predicted metrics
+    predicted_metrics = {
+        "RAG_model_response": response,
+        "ground_truth": ground_truth_answer,
+        "context_relevance": context_relevance(question, docs),
+        "context_utilization": context_utilization(response, docs),
+        "completeness": completeness(response, ground_truth_answer),
+        "adherence": adherence(response, docs),
+        "response_time": time_taken
+    }
+    # If ground_truth_answer and predicted_metrics are available, compute RMSE
+    if ground_truth_answer and predicted_metrics:
+        # Assuming that we are calculating RMSE for completeness or other relevant metrics
+        rmse_value = compute_rmse([predicted_metrics['completeness']], [ground_truth_answer])
+        predicted_metrics['rmse'] = rmse_value
+    return predicted_metrics
+
+''' def retrieve_ground_truths(question, dataset):
+    for split_name, instances in dataset.items():
+        print(f"Processing {split_name} split")
+        for instance in instances:
+            if instance['question'] == question:
+                instance_id = instance['id']
+                instance_response = instance['response']
+                # ground_truth_metrics = {
+                #     "context_relevance": instance['relevance_score'],
+                #     "context_utilization": instance['utilization_score'],
+                #     "completeness": instance['completeness_score'],
+                #     "adherence": instance['adherence_score']
+                # }
+                print(f"Match found in {split_name} split!")
+                print(f"ID: {instance_id}, Response: {instance_response}")
+                return instance_response  # Return ground truth response immediately
+
+    return None  # Return None if no match is found
+'''
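
For orientation, a minimal usage sketch of the metric helpers defined above. None of this is part of the commit: the question, documents, response, and reference answer are hypothetical placeholders, and the sketch assumes the file is saved as evaluation.py with the repo's data_processing module importable (the module imports load_query_dataset at import time).

# Hypothetical usage sketch -- not part of the commit above.
from evaluation import context_relevance, context_utilization, completeness, adherence

question = "What is the incubation period of COVID-19?"                # hypothetical query
docs = [                                                               # hypothetical retrieved chunks
    "The incubation period of COVID-19 is typically 2 to 14 days.",
    "Most patients develop symptoms around five days after exposure.",
]
response = "Symptoms usually appear within 2 to 14 days of exposure."  # stand-in for the RAG answer
reference = "The incubation period ranges from 2 to 14 days."          # stand-in for the ground truth

print("context_relevance:", context_relevance(question, docs))      # TF-IDF cosine(question, joined docs)
print("context_utilization:", context_utilization(response, docs))  # TF-IDF cosine(response, joined docs)
print("completeness:", completeness(response, reference))           # TF-IDF cosine(response, ground truth)
print("adherence:", adherence(response, docs))                       # share of response tokens found in docs

calculate_metrics wraps these same helpers, adding the ground-truth lookup via load_query_dataset plus the response-time and RMSE fields.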