vamseelatha2002 committed on
Commit 5f5fc92 · verified
1 Parent(s): 8306d3f

Update evaluation.py

Files changed (1)
  1. evaluation.py +148 -131
evaluation.py CHANGED
@@ -1,131 +1,148 @@
-
- import numpy as np
- from sklearn.metrics import mean_squared_error, roc_auc_score
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
-
- from data_processing import load_query_dataset
-
- global ground_truth_answer, ground_truth_metrics
-
- ground_truth_answer = ''
- ground_truth_metrics = {}
-
- # def calculate_metrics(question, response, docs, time_taken):
- #     data = load_ragbench()
- #     retrieve_ground_truths(question, data)
- #     # Predicted metrics
- #     predicted_metrics = {
- #         "ground_truth": ground_truth_answer,
- #         "context_relevance": context_relevance(question, docs),
- #         "context_utilization": context_utilization(response, docs),
- #         "completeness": completeness(response, ground_truth_answer),
- #         "adherence": adherence(response, docs),
- #         "response_time" : time_taken
- #     }
- #     return predicted_metrics
-
- # def retrieve_ground_truths(question,ragbench_set):
- #     for dataset_name in ragbench_set.keys():
- #         for split_name,instances in ragbench_set[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
- #             print(f"Processing {split_name} split")
- #             for instance in instances: # Fixed: Corrected indentation
- #                 # Check if the question (data) matches the query
- #                 if instance['question'] == question:
- #                     # If a match is found, retrieve id and response
- #                     instance_id = instance['id']
- #                     instance_response = instance['response']
- #                     ground_truth_metrics = {
- #                         "context_relevance": instance['relevance_score'],
- #                         "context_utilization": instance['utilization_score'],
- #                         "completeness": instance['completeness_score'],
- #                         "adherence": instance['adherence_score']
- #                     }
- #                     ground_truth_answer = instance_response
- #                     print(f"Match found in {split_name} split!")
- #                     print(f"ID: {instance_id}, Response: {instance_response}")
- #                     break # Exit after finding the first match (optional)
-
- # Step 1: Helper function to compute cosine similarity
- def compute_cosine_similarity(text1, text2):
-     if not text1 or not text2: # Check for empty or None values
-         print("Error: One or both input texts are empty. Returning similarity as 0.")
-         return 0.0
-
-     vectorizer = TfidfVectorizer(stop_words="english")
-
-     try:
-         vectors = vectorizer.fit_transform([text1, text2])
-         similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
-         return similarity
-     except ValueError as e:
-         print(f"Error in vectorization: {e}. Returning similarity as 0.")
-         return 0.0
-
- # Step 2: Metric 1 - Context Relevance
- def context_relevance(question, relevant_documents):
-     # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
-     combined_docs = " ".join([doc for doc in relevant_documents])
-     return compute_cosine_similarity(question, combined_docs)
-
- # Step 3: Metric 2 - Context Utilization
- def context_utilization(response, relevant_documents):
-     # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
-     combined_docs = " ".join([doc for doc in relevant_documents])
-     return compute_cosine_similarity(response, combined_docs)
-
- # Step 4: Metric 3 - Completeness
- def completeness(response, ground_truth_answer):
-     return compute_cosine_similarity(response, ground_truth_answer)
-
- # Step 5: Metric 4 - Adherence
- def adherence(response, relevant_documents):
-     # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
-     combined_docs = " ".join([doc for doc in relevant_documents])
-     response_tokens = set(response.split())
-     relevant_tokens = set(combined_docs.split())
-     supported_tokens = response_tokens.intersection(relevant_tokens)
-     return len(supported_tokens) / len(response_tokens)
-
- # Step 6: Compute RMSE for metrics
- def compute_rmse(predicted_values, ground_truth_values):
-     return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))
-
- def calculate_metrics(question, q_dataset, response, docs, time_taken):
-     data = load_query_dataset(q_dataset)
-     ground_truth_answer = retrieve_ground_truths(question, data) # Store the ground truth answer
-
-     # Ensure ground_truth_answer is not empty before proceeding
-     if ground_truth_answer is None:
-         ground_truth_answer = "" # Default to an empty string if no ground truth is found
-
-     # Predicted metrics
-     predicted_metrics = {
-         "RAG_model_response": response,
-         "ground_truth": ground_truth_answer,
-         "context_relevance": context_relevance(question, docs),
-         "context_utilization": context_utilization(response, docs),
-         "completeness": completeness(response, ground_truth_answer),
-         "adherence": adherence(response, docs),
-         "response_time": time_taken
-     }
-     return predicted_metrics
-
- def retrieve_ground_truths(question, dataset):
-     for split_name, instances in dataset.items():
-         print(f"Processing {split_name} split")
-         for instance in instances:
-             if instance['question'] == question:
-                 instance_id = instance['id']
-                 instance_response = instance['response']
-                 # ground_truth_metrics = {
-                 #     "context_relevance": instance['relevance_score'],
-                 #     "context_utilization": instance['utilization_score'],
-                 #     "completeness": instance['completeness_score'],
-                 #     "adherence": instance['adherence_score']
-                 # }
-                 print(f"Match found in {split_name} split!")
-                 print(f"ID: {instance_id}, Response: {instance_response}")
-                 return instance_response # Return ground truth response immediately
-
-     return None # Return None if no match is found

+
+ import numpy as np
+ from sklearn.metrics import mean_squared_error, roc_auc_score
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from data_processing import load_query_dataset
+
+ global ground_truth_answer, ground_truth_metrics
+
+ ground_truth_answer = ''
+ ground_truth_metrics = {}
+
+ # def calculate_metrics(question, response, docs, time_taken):
+ #     data = load_ragbench()
+ #     retrieve_ground_truths(question, data)
+ #     # Predicted metrics
+ #     predicted_metrics = {
+ #         "ground_truth": ground_truth_answer,
+ #         "context_relevance": context_relevance(question, docs),
+ #         "context_utilization": context_utilization(response, docs),
+ #         "completeness": completeness(response, ground_truth_answer),
+ #         "adherence": adherence(response, docs),
+ #         "response_time" : time_taken
+ #     }
+ #     return predicted_metrics
+
+ # def retrieve_ground_truths(question,ragbench_set):
+ #     for dataset_name in ragbench_set.keys():
+ #         for split_name,instances in ragbench_set[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
+ #             print(f"Processing {split_name} split")
+ #             for instance in instances: # Fixed: Corrected indentation
+ #                 # Check if the question (data) matches the query
+ #                 if instance['question'] == question:
+ #                     # If a match is found, retrieve id and response
+ #                     instance_id = instance['id']
+ #                     instance_response = instance['response']
+ #                     ground_truth_metrics = {
+ #                         "context_relevance": instance['relevance_score'],
+ #                         "context_utilization": instance['utilization_score'],
+ #                         "completeness": instance['completeness_score'],
+ #                         "adherence": instance['adherence_score']
+ #                     }
+ #                     ground_truth_answer = instance_response
+ #                     print(f"Match found in {split_name} split!")
+ #                     print(f"ID: {instance_id}, Response: {instance_response}")
+ #                     break # Exit after finding the first match (optional)
+
+ # Step 1: Helper function to compute cosine similarity
+ def compute_cosine_similarity(text1, text2):
+     if not text1 or not text2: # Check for empty or None values
+         print("Error: One or both input texts are empty. Returning similarity as 0.")
+         return 0.0
+
+     vectorizer = TfidfVectorizer(stop_words="english")
+
+     try:
+         vectors = vectorizer.fit_transform([text1, text2])
+         similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
+         return similarity
+     except ValueError as e:
+         print(f"Error in vectorization: {e}. Returning similarity as 0.")
+         return 0.0
+
+ # Step 2: Metric 1 - Context Relevance
+ def context_relevance(question, relevant_documents):
+     # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+     combined_docs = " ".join([doc for doc in relevant_documents])
+     return compute_cosine_similarity(question, combined_docs)
+
+ # Step 3: Metric 2 - Context Utilization
+ def context_utilization(response, relevant_documents):
+     # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+     combined_docs = " ".join([doc for doc in relevant_documents])
+     return compute_cosine_similarity(response, combined_docs)
+
+ # Step 4: Metric 3 - Completeness
+ def completeness(response, ground_truth_answer):
+     return compute_cosine_similarity(response, ground_truth_answer)
+
+ # Step 5: Metric 4 - Adherence
+ def adherence(response, relevant_documents):
+     # combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+     combined_docs = " ".join([doc for doc in relevant_documents])
+     response_tokens = set(response.split())
+     relevant_tokens = set(combined_docs.split())
+     supported_tokens = response_tokens.intersection(relevant_tokens)
+     return len(supported_tokens) / len(response_tokens)
+
+ # Step 6: Compute RMSE for metrics
+ def compute_rmse(predicted_values, ground_truth_values):
+     return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))
+
+ def retrieve_ground_truths(question, dataset):
+     """Retrieve the ground truth answer for a given question from the dataset."""
+     for split_name, instances in dataset.items():
+         for instance in instances:
+             if instance['question'] == question:
+                 return instance['response'] # Return the ground truth response immediately
+     return None # Return None if no match is found
+
+
+ def calculate_metrics(question, q_dataset, response, docs, time_taken):
+     data = load_query_dataset(q_dataset)
+     ground_truth_answer = retrieve_ground_truths(question, data) # Store the ground truth answer
+
+     # Ensure ground_truth_answer is not empty before proceeding
+     if ground_truth_answer is None:
+         ground_truth_answer = "" # Default to an empty string if no ground truth is found
+
+     # Predicted metrics
+     predicted_metrics = {
+         "RAG_model_response": response,
+         "ground_truth": ground_truth_answer,
+         "context_relevance": context_relevance(question, docs),
+         "context_utilization": context_utilization(response, docs),
+         "completeness": completeness(response, ground_truth_answer),
+         "adherence": adherence(response, docs),
+         "response_time": time_taken
+     }
+     # If ground_truth_answer and predicted_metrics are available, compute RMSE
+     if ground_truth_answer and predicted_metrics:
+         # Assuming that we are calculating RMSE for completeness or other relevant metrics
+         rmse_value = compute_rmse([predicted_metrics['completeness']], [ground_truth_answer])
+         predicted_metrics['rmse'] = rmse_value
+     return predicted_metrics
+
+ ''' def retrieve_ground_truths(question, dataset):
+     for split_name, instances in dataset.items():
+         print(f"Processing {split_name} split")
+         for instance in instances:
+             if instance['question'] == question:
+                 instance_id = instance['id']
+                 instance_response = instance['response']
+                 # ground_truth_metrics = {
+                 #     "context_relevance": instance['relevance_score'],
+                 #     "context_utilization": instance['utilization_score'],
+                 #     "completeness": instance['completeness_score'],
+                 #     "adherence": instance['adherence_score']
+                 # }
+                 print(f"Match found in {split_name} split!")
+                 print(f"ID: {instance_id}, Response: {instance_response}")
+                 return instance_response # Return ground truth response immediately
+
+     return None # Return None if no match is found
+ '''
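
Note on the new RMSE step: compute_rmse() wraps sklearn's mean_squared_error, which needs numeric values on both sides, but the ground truth that calculate_metrics() passes in is the reference answer text returned by retrieve_ground_truths(). Below is a minimal sketch, not part of this commit, of how the RMSE could instead be taken against the dataset's numeric ground-truth scores (the relevance_score, utilization_score, completeness_score and adherence_score fields referenced in the commented-out code above); the helper name and the example values are illustrative assumptions.

    # Illustrative sketch only - not part of evaluation.py. Assumes each matched
    # RAGBench instance carries the numeric ground-truth scores named in the
    # commented-out ground_truth_metrics block.
    import numpy as np
    from sklearn.metrics import mean_squared_error

    def rmse_against_ground_truth(predicted_metrics, instance):
        """RMSE between predicted metric values and the instance's numeric scores."""
        # Predicted-metric keys mapped to the instance fields holding reference scores.
        score_fields = {
            "context_relevance": "relevance_score",
            "context_utilization": "utilization_score",
            "completeness": "completeness_score",
            "adherence": "adherence_score",
        }
        predicted, reference = [], []
        for metric_key, field in score_fields.items():
            if instance.get(field) is not None:
                predicted.append(predicted_metrics[metric_key])
                reference.append(instance[field])
        if not predicted:
            return None  # no numeric ground truth available for this instance
        return float(np.sqrt(mean_squared_error(reference, predicted)))

    # Example with made-up numbers:
    predicted = {"context_relevance": 0.85, "context_utilization": 0.75,
                 "completeness": 0.90, "adherence": 0.95}
    instance = {"relevance_score": 0.9, "utilization_score": 0.8,
                "completeness_score": 1.0, "adherence_score": 1.0}
    print(rmse_against_ground_truth(predicted, instance))  # approx. 0.066

A related small hardening, also outside this commit: adherence() divides by len(response_tokens), so returning 0.0 when response_tokens is empty would avoid a ZeroDivisionError on empty responses.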