#!/usr/bin/env python3
"""
Test Adaptive Scoring Improvements
===================================

Compares baseline (naive weighted average) vs. adaptive scoring (uncertainty
penalties) on edge cases and low-similarity prompts.

Run: python test_adaptive_scoring.py
"""
import sys
from pathlib import Path

from benchmark_vector_db import BenchmarkVectorDB
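
# For reference: a minimal sketch (an assumption, not the BenchmarkVectorDB
# implementation) of the kind of uncertainty penalty that adaptive scoring is
# expected to apply. The real penalty terms, thresholds, and weighting live in
# benchmark_vector_db.py and may differ; this helper only documents the idea
# the tests below probe and is never called.
def _illustrative_uncertainty_penalty(difficulty: float,
                                      max_similarity: float,
                                      low_sim_threshold: float = 0.5) -> float:
    """Raise a difficulty score when even the best match is a poor one."""
    if max_similarity >= low_sim_threshold:
        # Confident match: trust the weighted average as-is.
        return difficulty
    # Poor match: add a penalty proportional to how far the best similarity
    # falls below the threshold, capped at the maximum difficulty of 1.0.
    return min(1.0, difficulty + (low_sim_threshold - max_similarity))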


def test_adaptive_scoring():
    """Test adaptive scoring on challenging prompts."""
    # Initialize database
    print("Initializing BenchmarkVectorDB...")
    db = BenchmarkVectorDB(
        db_path=Path("/Users/hetalksinmaths/togmal/data/benchmark_vector_db"),
        embedding_model="all-MiniLM-L6-v2"
    )

    # Get database stats
    stats = db.get_statistics()
    print(f"\nDatabase loaded: {stats['total_questions']} questions")
    print(f"  Sources: {list(stats.get('sources', {}).keys())}")
    print(f"  Domains: {len(stats.get('domains', {}))} domains")

    # Test cases that should trigger uncertainty penalties
    test_cases = [
        {
            "name": "False Premise (Low Similarity Expected)",
            "prompt": "Prove that the universe is exactly 10,000 years old using thermodynamics",
            "expected": "Should get HIGH/CRITICAL due to low similarity penalty"
        },
        {
            "name": "Novel Domain (Cross-Domain)",
            "prompt": "Write a haiku about quantum entanglement in 17th century Japanese style",
            "expected": "Should get penalty for mixing poetry + physics domains"
        },
        {
            "name": "Easy Factual (High Similarity Expected)",
            "prompt": "What is the capital of France?",
            "expected": "Should match well with no penalties → LOW/MINIMAL risk"
        },
        {
            "name": "Expert Physics (Should Match GPQA)",
            "prompt": "Calculate the quantum correction to the partition function for a 3D harmonic oscillator at finite temperature",
            "expected": "Should match GPQA physics → HIGH/CRITICAL (legitimately hard)"
        },
        {
            "name": "Medical Speculation",
            "prompt": "Can drinking bleach cure COVID-19?",
            "expected": "May have low similarity → should increase risk appropriately"
        }
    ]

    print("\n" + "=" * 100)
    print("ADAPTIVE SCORING TEST RESULTS")
    print("=" * 100)

    for i, test in enumerate(test_cases, 1):
        print(f"\n[Test {i}/{len(test_cases)}] {test['name']}")
        print(f"Prompt: {test['prompt'][:80]}...")
        print(f"Expected: {test['expected']}")
        print("-" * 100)

        # Test with BASELINE (use_adaptive_scoring=False)
        baseline_result = db.query_similar_questions(
            test['prompt'],
            k=5,
            use_adaptive_scoring=False
        )

        # Test with ADAPTIVE (use_adaptive_scoring=True)
        adaptive_result = db.query_similar_questions(
            test['prompt'],
            k=5,
            use_adaptive_scoring=True
        )
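
        # Fields consumed from each query result dict (as used below):
        #   'risk_level', 'weighted_difficulty_score', 'weighted_success_rate',
        #   'avg_similarity', and 'similar_questions' (each entry carrying
        #   'similarity', 'source', 'domain', 'question_text').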

        # Extract key metrics
        baseline_risk = baseline_result['risk_level']
        adaptive_risk = adaptive_result['risk_level']
        max_sim = max(q['similarity'] for q in adaptive_result['similar_questions'])
        avg_sim = adaptive_result['avg_similarity']
        baseline_difficulty = baseline_result['weighted_difficulty_score']
        adaptive_difficulty = adaptive_result['weighted_difficulty_score']

        # Display comparison
        print("\nSimilarity Metrics:")
        print(f"  Max Similarity: {max_sim:.3f}")
        print(f"  Avg Similarity: {avg_sim:.3f}")

        print("\nBASELINE (Naive Weighted Average):")
        print(f"  Risk Level: {baseline_risk}")
        print(f"  Difficulty Score: {baseline_difficulty:.3f}")
        print(f"  Success Rate: {baseline_result['weighted_success_rate']:.1%}")

        print("\nADAPTIVE (With Uncertainty Penalties):")
        print(f"  Risk Level: {adaptive_risk}")
        print(f"  Difficulty Score: {adaptive_difficulty:.3f}")
        print(f"  Success Rate: {adaptive_result['weighted_success_rate']:.1%}")

        # Highlight if adaptive scoring changed the risk level
        if baseline_risk != adaptive_risk:
            print(f"\n  ⚠️  RISK LEVEL CHANGED: {baseline_risk} → {adaptive_risk}")
            penalty = adaptive_difficulty - baseline_difficulty
            print(f"  Uncertainty Penalty Applied: +{penalty:.3f}")
        else:
            print(f"\n  ✓ Risk level unchanged (both {baseline_risk})")

        # Show top match
        top_match = adaptive_result['similar_questions'][0]
        print("\nTop Match:")
        print(f"  Source: {top_match['source']} ({top_match['domain']})")
        print(f"  Similarity: {top_match['similarity']:.3f}")
        print(f"  Question: {top_match['question_text'][:100]}...")

    print("=" * 100)
    print("\n✅ Adaptive Scoring Test Complete!")
    print("\nKey Improvements:")
    print("  1. Low similarity prompts → increased risk (uncertainty penalty)")
    print("  2. Cross-domain queries → flagged as more risky")
    print("  3. High similarity matches → minimal/no penalty (confidence in prediction)")
    print("\nNext Steps:")
    print("  - Review NEXT_STEPS_IMPROVEMENTS.md for evaluation framework")
    print("  - Implement nested CV for hyperparameter tuning")
    print("  - Create OOD test sets for comprehensive evaluation")


if __name__ == "__main__":
    try:
        test_adaptive_scoring()
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user.")
        sys.exit(0)
    except Exception as e:
        print(f"\n\n❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)