Upload 2 files
- app.py +1 -1
- evaluation.py +1 -36
app.py
CHANGED
@@ -49,7 +49,7 @@ if st.session_state.recent_questions:
     # Display Recent Questions
     st.sidebar.title("Overall RMSE")
     rmse_values = [q["metrics"]["RMSE"] for q in recent_qns if "metrics" in q and "RMSE" in q["metrics"]]
-    if any(rmse_values):
+    if any(rmse_values) and len(rmse_values) > 0:
         average_rmse = sum(rmse_values) / len(rmse_values) if rmse_values else 0
         st.sidebar.write(f"📊 **Average RMSE:** {average_rmse:.4f} for {len(rmse_values)} questions")

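A note on the new guard: any(rmse_values) is False not only when the list is empty but also when every collected RMSE happens to be 0.0, and the added len(rmse_values) > 0 clause does not change that edge case. A minimal sketch of an emptiness-only check (an assumption about the intent, not a line from this commit; the session key "recent_questions" and the sidebar output are taken from the diff above):

import streamlit as st

# Sketch only: read the app's list of past question records from session state;
# the exact way recent_qns is built in app.py is not shown in this diff.
recent_qns = st.session_state.get("recent_questions", [])
rmse_values = [q["metrics"]["RMSE"] for q in recent_qns if "metrics" in q and "RMSE" in q["metrics"]]
if rmse_values:  # empty-list check; stays True even when every RMSE is 0.0
    average_rmse = sum(rmse_values) / len(rmse_values)
    st.sidebar.write(f"📊 **Average RMSE:** {average_rmse:.4f} for {len(rmse_values)} questions")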
evaluation.py
CHANGED
@@ -11,41 +11,6 @@ global ground_truth_answer, ground_truth_metrics
 ground_truth_answer = ''
 ground_truth_metrics = {}

-# def calculate_metrics(question, response, docs, time_taken):
-#     data = load_ragbench()
-#     retrieve_ground_truths(question, data)
-#     # Predicted metrics
-#     predicted_metrics = {
-#         "ground_truth": ground_truth_answer,
-#         "context_relevance": context_relevance(question, docs),
-#         "context_utilization": context_utilization(response, docs),
-#         "completeness": completeness(response, ground_truth_answer),
-#         "adherence": adherence(response, docs),
-#         "response_time" : time_taken
-#     }
-#     return predicted_metrics
-
-# def retrieve_ground_truths(question,ragbench_set):
-#     for dataset_name in ragbench_set.keys():
-#         for split_name,instances in ragbench_set[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
-#             print(f"Processing {split_name} split")
-#             for instance in instances: # Fixed: Corrected indentation
-#                 # Check if the question (data) matches the query
-#                 if instance['question'] == question:
-#                     # If a match is found, retrieve id and response
-#                     instance_id = instance['id']
-#                     instance_response = instance['response']
-#                     ground_truth_metrics = {
-#                         "context_relevance": instance['relevance_score'],
-#                         "context_utilization": instance['utilization_score'],
-#                         "completeness": instance['completeness_score'],
-#                         "adherence": instance['adherence_score']
-#                     }
-#                     ground_truth_answer = instance_response
-#                     print(f"Match found in {split_name} split!")
-#                     print(f"ID: {instance_id}, Response: {instance_response}")
-#                     break # Exit after finding the first match (optional)
-
 # Step 1: Helper function to compute cosine similarity
 def compute_cosine_similarity(text1, text2):
     if not text1 or not text2: # Check for empty or None values
@@ -109,7 +74,7 @@ def calculate_metrics(question, q_dataset, response, docs, time_taken):
         "adherence": adherence(response, docs),
     }

-    rmse = compute_rmse(predicted_metrics, ground_truth_metrics)
+    rmse = compute_rmse(predicted_metrics, ground_truth_metrics)

     metrics = {
         "RMSE": rmse,