Andy Lee committed on
Commit
78ec24e
·
1 Parent(s): 6c8b7ac

fix: keep simple, and use hf_token for qwen

Browse files
Files changed (3) hide show
  1. app.py +114 -352
  2. config.py +7 -19
  3. hf_chat.py +2 -2
app.py CHANGED
@@ -4,120 +4,41 @@ import os
4
  import time
5
  from io import BytesIO
6
  from PIL import Image
7
- from typing import Dict, List, Any
8
  from pathlib import Path
9
 
10
- # Import core logic and configurations from the project
11
- from geo_bot import (
12
- GeoBot,
13
- AGENT_PROMPT_TEMPLATE,
14
- BENCHMARK_PROMPT,
15
- )
16
  from benchmark import MapGuesserBenchmark
17
  from config import MODELS_CONFIG, get_data_paths, SUCCESS_THRESHOLD_KM
18
  from langchain_openai import ChatOpenAI
19
  from langchain_anthropic import ChatAnthropic
20
  from langchain_google_genai import ChatGoogleGenerativeAI
21
-
22
  from hf_chat import HuggingFaceChat
23
 
24
-
25
- def setup_api_keys():
26
- """Setup API keys from Streamlit secrets and show status"""
27
- key_status = {}
28
-
29
- # OpenAI
30
- openai_key = st.secrets.get("OPENAI_API_KEY", "")
31
- if openai_key:
32
- os.environ["OPENAI_API_KEY"] = openai_key
33
- key_status["OpenAI"] = "βœ… Available"
34
- else:
35
- key_status["OpenAI"] = "❌ Missing"
36
-
37
- # Anthropic
38
- anthropic_key = st.secrets.get("ANTHROPIC_API_KEY", "")
39
- if anthropic_key:
40
- os.environ["ANTHROPIC_API_KEY"] = anthropic_key
41
- key_status["Anthropic"] = "βœ… Available"
42
- else:
43
- key_status["Anthropic"] = "❌ Missing"
44
-
45
- # Google
46
- google_key = st.secrets.get("GOOGLE_API_KEY", "")
47
- if google_key:
48
- os.environ["GOOGLE_API_KEY"] = google_key
49
- key_status["Google"] = "βœ… Available"
50
- else:
51
- key_status["Google"] = "❌ Missing"
52
-
53
- # HuggingFace
54
- hf_key = st.secrets.get("HUGGINGFACE_API_KEY", "")
55
- if hf_key:
56
- os.environ["HUGGINGFACE_API_KEY"] = hf_key
57
- key_status["HuggingFace"] = "βœ… Available"
58
- else:
59
- key_status["HuggingFace"] = "❌ Missing"
60
-
61
- return key_status
62
-
63
-
64
- def get_available_models(key_status):
65
- """Get available models based on API key status"""
66
- available_models = {}
67
-
68
- for model_id, config in MODELS_CONFIG.items():
69
- api_key_env = config["api_key_env"]
70
-
71
- # Check if required API key is available
72
- if (
73
- api_key_env == "OPENAI_API_KEY"
74
- and "OpenAI" in key_status
75
- and "βœ…" in key_status["OpenAI"]
76
- ):
77
- available_models[model_id] = config
78
- elif (
79
- api_key_env == "ANTHROPIC_API_KEY"
80
- and "Anthropic" in key_status
81
- and "βœ…" in key_status["Anthropic"]
82
- ):
83
- available_models[model_id] = config
84
- elif (
85
- api_key_env == "GOOGLE_API_KEY"
86
- and "Google" in key_status
87
- and "βœ…" in key_status["Google"]
88
- ):
89
- available_models[model_id] = config
90
- elif (
91
- api_key_env == "HUGGINGFACE_API_KEY"
92
- and "HuggingFace" in key_status
93
- and "βœ…" in key_status["HuggingFace"]
94
- ):
95
- if HuggingFaceChat is not None: # Only if wrapper is available
96
- available_models[model_id] = config
97
-
98
- return available_models
99
 
100
 
101
  def get_available_datasets():
102
- """Get list of available datasets"""
103
  datasets_dir = Path("datasets")
104
  if not datasets_dir.exists():
105
  return ["default"]
106
-
107
  datasets = []
108
  for dataset_dir in datasets_dir.iterdir():
109
  if dataset_dir.is_dir():
110
- dataset_name = dataset_dir.name
111
- data_paths = get_data_paths(dataset_name)
112
  if os.path.exists(data_paths["golden_labels"]):
113
- datasets.append(dataset_name)
114
-
115
  return datasets if datasets else ["default"]
116
 
117
 
118
- def get_model_class(model_config):
119
- """Get the appropriate model class based on config"""
120
- class_name = model_config["class"]
121
  if class_name == "ChatOpenAI":
122
  return ChatOpenAI
123
  elif class_name == "ChatAnthropic":
@@ -130,185 +51,84 @@ def get_model_class(model_config):
130
  raise ValueError(f"Unknown model class: {class_name}")
131
 
132
 
133
- # --- Page UI Setup ---
134
  st.set_page_config(page_title="MapCrunch AI Agent", layout="wide")
135
  st.title("πŸ—ΊοΈ MapCrunch AI Agent")
136
- st.caption(
137
- "An AI agent that explores and identifies geographic locations through multi-step interaction."
138
- )
139
 
140
- # Setup API keys and check status
141
- key_status = setup_api_keys()
142
- available_models = get_available_models(key_status)
143
-
144
- # --- Sidebar for Configuration ---
145
  with st.sidebar:
146
- st.header("βš™οΈ Agent Configuration")
147
-
148
- # Show API key status
149
- with st.expander("πŸ”‘ API Key Status", expanded=False):
150
- for provider, status in key_status.items():
151
- st.text(f"{provider}: {status}")
152
 
153
- if not any("βœ…" in status for status in key_status.values()):
154
- st.error(
155
- "⚠️ No API keys configured! Please set up API keys in HF Spaces secrets."
156
- )
157
- st.info(
158
- "Add these secrets in your Space settings:\n- OPENAI_API_KEY\n- ANTHROPIC_API_KEY\n- GOOGLE_API_KEY\n- HUGGINGFACE_API_KEY"
159
- )
160
 
161
- # Dataset selection
162
- available_datasets = get_available_datasets()
163
- dataset_choice = st.selectbox("Select Dataset", available_datasets)
164
-
165
- # Model selection (only show available models)
166
- if not available_models:
167
- st.error("❌ No models available! Please configure API keys.")
168
- st.stop()
169
-
170
- model_options = {
171
- model_id: f"{model_id} - {config['description']}"
172
- for model_id, config in available_models.items()
173
- }
174
- model_choice = st.selectbox(
175
- "Select AI Model",
176
- list(model_options.keys()),
177
- format_func=lambda x: model_options[x],
178
- )
179
 
180
- steps_per_sample = st.slider(
181
- "Max Exploration Steps per Sample", min_value=3, max_value=20, value=10
 
182
  )
183
 
184
- # Load golden labels for selected dataset
185
- data_paths = get_data_paths(dataset_choice)
186
- try:
187
- with open(data_paths["golden_labels"], "r", encoding="utf-8") as f:
188
- golden_labels = json.load(f).get("samples", [])
189
- total_samples = len(golden_labels)
190
-
191
- st.info(f"Dataset '{dataset_choice}' has {total_samples} samples")
192
-
193
- num_samples_to_run = st.slider(
194
- "Number of Samples to Test",
195
- min_value=1,
196
- max_value=total_samples,
197
- value=min(3, total_samples),
198
- )
199
- except FileNotFoundError:
200
- st.error(
201
- f"Dataset '{dataset_choice}' not found at {data_paths['golden_labels']}. Please create the dataset first."
202
- )
203
- golden_labels = []
204
- num_samples_to_run = 0
205
-
206
- start_button = st.button(
207
- "πŸš€ Start Agent Benchmark", disabled=(num_samples_to_run == 0), type="primary"
208
- )
209
 
210
- # --- Agent Execution Logic ---
211
  if start_button:
212
- # Prepare the environment
213
- test_samples = golden_labels[:num_samples_to_run]
214
-
215
- config = available_models.get(model_choice)
216
- if not config:
217
- st.error(f"Model {model_choice} is not available!")
218
- st.stop()
219
-
220
- try:
221
- model_class = get_model_class(config)
222
- model_instance_name = config["model_name"]
223
- except Exception as e:
224
- st.error(f"Failed to load model class: {e}")
225
- st.stop()
226
-
227
- # Initialize helpers and result lists
228
  benchmark_helper = MapGuesserBenchmark(dataset_name=dataset_choice)
229
  all_results = []
230
 
231
- st.info(
232
- f"Starting Agent Benchmark... Dataset: {dataset_choice}, Model: {model_choice}, Steps: {steps_per_sample}, Samples: {num_samples_to_run}"
233
- )
234
 
235
- overall_progress_bar = st.progress(0, text="Overall Progress")
 
 
 
 
 
236
 
237
- # Initialize the bot outside the loop to reuse the browser instance for efficiency
238
- with st.spinner("Initializing browser and AI model..."):
239
- try:
240
- # Note: Must run in headless mode on HF Spaces
241
- bot = GeoBot(
242
- model=model_class, model_name=model_instance_name, headless=True
243
- )
244
- except Exception as e:
245
- st.error(f"Failed to initialize model: {e}")
246
- st.info("This might be due to API key issues or model unavailability.")
247
- st.stop()
248
-
249
- # Main loop to iterate through all selected test samples
250
- for i, sample in enumerate(test_samples):
251
- sample_id = sample.get("id", "N/A")
252
- st.divider()
253
- st.header(f"▢️ Running Sample {i + 1}/{num_samples_to_run} (ID: {sample_id})")
254
-
255
- if not bot.controller.load_location_from_data(sample):
256
- st.error(f"Failed to load location for sample {sample_id}. Skipping.")
257
- continue
258
-
259
- bot.controller.setup_clean_environment()
260
-
261
- # Create the visualization layout for the current sample
262
- col1, col2 = st.columns([2, 3])
263
- with col1:
264
- image_placeholder = st.empty()
265
- with col2:
266
- reasoning_placeholder = st.empty()
267
- action_placeholder = st.empty()
268
-
269
- # --- Inner agent exploration loop ---
270
- history = []
271
- final_guess = None
272
-
273
- for step in range(steps_per_sample):
274
- step_num = step + 1
275
- unique_step_id = f"sample_{i}_step_{step_num}" # Unique identifier
276
-
277
- reasoning_placeholder.info(
278
- f"πŸ€” Thinking... (Step {step_num}/{steps_per_sample})"
279
- )
280
- action_placeholder.empty()
281
 
282
- try:
283
- # Observe and label arrows
284
  bot.controller.label_arrows_on_screen()
285
  screenshot_bytes = bot.controller.take_street_view_screenshot()
 
286
 
287
- # Current view
288
- image_placeholder.image(
289
- screenshot_bytes,
290
- caption=f"πŸ” Step {step_num} - What AI Sees Now",
291
- use_column_width=True,
292
- )
293
-
294
- # Update history
295
- current_step_data = {
296
  "image_b64": bot.pil_to_base64(
297
  Image.open(BytesIO(screenshot_bytes))
298
  ),
299
  "action": "N/A",
300
- "screenshot_bytes": screenshot_bytes,
301
- "step_num": step_num,
302
  }
303
- history.append(current_step_data)
304
 
305
- # Think
306
  available_actions = bot.controller.get_available_actions()
307
  history_text = "\n".join(
308
  [f"Step {j + 1}: {h['action']}" for j, h in enumerate(history[:-1])]
309
  )
310
  if not history_text:
311
- history_text = "No history yet. This is the first step."
312
 
313
  prompt = AGENT_PROMPT_TEMPLATE.format(
314
  remaining_steps=steps_per_sample - step,
@@ -316,150 +136,92 @@ if start_button:
316
  available_actions=json.dumps(available_actions),
317
  )
318
 
319
- # Show what AI is considering
320
- with reasoning_placeholder:
321
- st.info("🧠 **AI is analyzing the situation...**")
322
- with st.expander("πŸ” Available Actions", expanded=False):
323
- st.json(available_actions)
324
-
325
- # Only show context if there's meaningful history
326
- if len(history) > 1:
327
- with st.expander("πŸ“ Previous Steps", expanded=False):
328
- for j, h in enumerate(history[:-1]):
329
- st.write(f"Step {j + 1}: {h.get('action', 'N/A')}")
330
-
331
  message = bot._create_message_with_history(
332
  prompt, [h["image_b64"] for h in history]
333
  )
334
-
335
- # Get AI response
336
  response = bot.model.invoke(message)
337
  decision = bot._parse_agent_response(response)
338
 
339
- if not decision: # Fallback
340
  decision = {
341
  "action_details": {"action": "PAN_RIGHT"},
342
- "reasoning": "⚠️ Response parsing failed. Using default recovery action.",
343
  }
344
 
345
  action = decision.get("action_details", {}).get("action")
346
  history[-1]["action"] = action
347
- history[-1]["reasoning"] = decision.get("reasoning", "N/A")
348
 
349
- # Display AI's decision
350
- reasoning_placeholder.success("βœ… **AI Decision Made!**")
351
 
352
  with action_placeholder:
353
- st.success(f"🎯 **AI Action:** `{action}`")
 
354
 
355
- # Show reasoning in expandable section
356
- with st.expander("🧠 AI's Reasoning", expanded=True):
357
- st.info(decision.get("reasoning", "N/A"))
358
-
359
- if action == "GUESS":
360
- lat = decision.get("action_details", {}).get("lat")
361
- lon = decision.get("action_details", {}).get("lon")
362
- if lat and lon:
363
- st.success(f"πŸ“ **Final Guess:** {lat:.4f}, {lon:.4f}")
364
-
365
- # Force a GUESS on the last step
366
  if step_num == steps_per_sample and action != "GUESS":
367
- st.warning("⏰ Max steps reached. Forcing a GUESS action.")
368
  action = "GUESS"
369
 
370
- # Act
371
  if action == "GUESS":
372
- lat, lon = (
373
- decision.get("action_details", {}).get("lat"),
374
- decision.get("action_details", {}).get("lon"),
375
- )
376
  if lat is not None and lon is not None:
377
  final_guess = (lat, lon)
378
- else:
379
- st.error(
380
- "❌ GUESS action was missing coordinates. Guess failed for this sample."
381
- )
382
- break # End exploration for the current sample
383
-
384
  elif action == "MOVE_FORWARD":
385
- with st.spinner("🚢 Moving forward..."):
386
- bot.controller.move("forward")
387
  elif action == "MOVE_BACKWARD":
388
- with st.spinner("πŸ”„ Moving backward..."):
389
- bot.controller.move("backward")
390
  elif action == "PAN_LEFT":
391
- with st.spinner("⬅️ Panning left..."):
392
- bot.controller.pan_view("left")
393
  elif action == "PAN_RIGHT":
394
- with st.spinner("➑️ Panning right..."):
395
- bot.controller.pan_view("right")
396
 
397
- time.sleep(1) # A brief pause between steps
398
 
399
- except Exception as e:
400
- st.error(f"Error in step {step_num}: {e}")
401
- break
 
402
 
403
- # --- End of single sample run, calculate and display results ---
404
- true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
405
- distance_km = None
406
- is_success = False
407
-
408
- if final_guess:
409
- distance_km = benchmark_helper.calculate_distance(true_coords, final_guess)
410
- if distance_km is not None:
411
- is_success = distance_km <= SUCCESS_THRESHOLD_KM
 
 
 
 
 
 
 
 
 
412
 
413
- st.subheader("🎯 Round Result")
414
- res_col1, res_col2, res_col3 = st.columns(3)
415
- res_col1.metric(
416
- "Final Guess (Lat, Lon)", f"{final_guess[0]:.3f}, {final_guess[1]:.3f}"
417
- )
418
- res_col2.metric(
419
- "Ground Truth (Lat, Lon)",
420
- f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}",
421
- )
422
- res_col3.metric(
423
- "Distance Error",
424
- f"{distance_km:.1f} km" if distance_km is not None else "N/A",
425
- delta=f"{'Success' if is_success else 'Failure'}",
426
- delta_color=("inverse" if is_success else "off"),
427
  )
428
- else:
429
- st.error("Agent failed to make a final guess.")
430
-
431
- all_results.append(
432
- {
433
- "sample_id": sample_id,
434
- "model": model_choice,
435
- "true_coordinates": true_coords,
436
- "predicted_coordinates": final_guess,
437
- "distance_km": distance_km,
438
- "success": is_success,
439
- }
440
- )
441
-
442
- # Update overall progress bar
443
- overall_progress_bar.progress(
444
- (i + 1) / num_samples_to_run,
445
- text=f"Overall Progress: {i + 1}/{num_samples_to_run}",
446
- )
447
-
448
- # --- End of all samples, display final summary ---
449
- bot.close() # Close the browser
450
- st.divider()
451
- st.header("🏁 Benchmark Summary")
452
 
 
 
 
 
 
453
  summary = benchmark_helper.generate_summary(all_results)
454
  if summary and model_choice in summary:
455
  stats = summary[model_choice]
456
- sum_col1, sum_col2 = st.columns(2)
457
- sum_col1.metric(
458
- "Overall Success Rate", f"{stats.get('success_rate', 0) * 100:.1f} %"
459
- )
460
- sum_col2.metric(
461
- "Average Distance Error", f"{stats.get('average_distance_km', 0):.1f} km"
462
- )
463
- st.dataframe(all_results) # Display the detailed results table
464
- else:
465
- st.warning("Not enough results to generate a summary.")
 
4
  import time
5
  from io import BytesIO
6
  from PIL import Image
 
7
  from pathlib import Path
8
 
9
+ from geo_bot import GeoBot, AGENT_PROMPT_TEMPLATE
 
 
 
 
 
10
  from benchmark import MapGuesserBenchmark
11
  from config import MODELS_CONFIG, get_data_paths, SUCCESS_THRESHOLD_KM
12
  from langchain_openai import ChatOpenAI
13
  from langchain_anthropic import ChatAnthropic
14
  from langchain_google_genai import ChatGoogleGenerativeAI
 
15
  from hf_chat import HuggingFaceChat
16
 
17
+ # Simple API key setup
18
+ if "OPENAI_API_KEY" in st.secrets:
19
+ os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
20
+ if "ANTHROPIC_API_KEY" in st.secrets:
21
+ os.environ["ANTHROPIC_API_KEY"] = st.secrets["ANTHROPIC_API_KEY"]
22
+ if "GOOGLE_API_KEY" in st.secrets:
23
+ os.environ["GOOGLE_API_KEY"] = st.secrets["GOOGLE_API_KEY"]
24
+ if "HF_TOKEN" in st.secrets:
25
+ os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  def get_available_datasets():
 
29
  datasets_dir = Path("datasets")
30
  if not datasets_dir.exists():
31
  return ["default"]
 
32
  datasets = []
33
  for dataset_dir in datasets_dir.iterdir():
34
  if dataset_dir.is_dir():
35
+ data_paths = get_data_paths(dataset_dir.name)
 
36
  if os.path.exists(data_paths["golden_labels"]):
37
+ datasets.append(dataset_dir.name)
 
38
  return datasets if datasets else ["default"]
39
 
40
 
41
+ def get_model_class(class_name):
 
 
42
  if class_name == "ChatOpenAI":
43
  return ChatOpenAI
44
  elif class_name == "ChatAnthropic":
 
51
  raise ValueError(f"Unknown model class: {class_name}")
52
 
53
 
54
+ # UI Setup
55
  st.set_page_config(page_title="MapCrunch AI Agent", layout="wide")
56
  st.title("πŸ—ΊοΈ MapCrunch AI Agent")
 
 
 
57
 
58
+ # Sidebar
 
 
 
 
59
  with st.sidebar:
60
+ st.header("βš™οΈ Configuration")
 
 
 
 
 
61
 
62
+ dataset_choice = st.selectbox("Dataset", get_available_datasets())
63
+ model_choice = st.selectbox("Model", list(MODELS_CONFIG.keys()))
64
+ steps_per_sample = st.slider("Max Steps", 3, 20, 10)
 
 
 
 
65
 
66
+ # Load dataset
67
+ data_paths = get_data_paths(dataset_choice)
68
+ with open(data_paths["golden_labels"], "r") as f:
69
+ golden_labels = json.load(f).get("samples", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ st.info(f"Dataset has {len(golden_labels)} samples")
72
+ num_samples = st.slider(
73
+ "Samples to Test", 1, len(golden_labels), min(3, len(golden_labels))
74
  )
75
 
76
+ start_button = st.button("πŸš€ Start", type="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ # Main Logic
79
  if start_button:
80
+ test_samples = golden_labels[:num_samples]
81
+ config = MODELS_CONFIG[model_choice]
82
+ model_class = get_model_class(config["class"])
83
+
 
 
 
 
 
 
 
 
 
 
 
 
84
  benchmark_helper = MapGuesserBenchmark(dataset_name=dataset_choice)
85
  all_results = []
86
 
87
+ progress_bar = st.progress(0)
 
 
88
 
89
+ with GeoBot(
90
+ model=model_class, model_name=config["model_name"], headless=True
91
+ ) as bot:
92
+ for i, sample in enumerate(test_samples):
93
+ st.divider()
94
+ st.header(f"Sample {i + 1}/{num_samples}")
95
 
96
+ bot.controller.load_location_from_data(sample)
97
+ bot.controller.setup_clean_environment()
98
+
99
+ col1, col2 = st.columns([2, 3])
100
+
101
+ with col1:
102
+ image_placeholder = st.empty()
103
+ with col2:
104
+ reasoning_placeholder = st.empty()
105
+ action_placeholder = st.empty()
106
+
107
+ history = []
108
+ final_guess = None
109
+
110
+ for step in range(steps_per_sample):
111
+ step_num = step + 1
112
+ reasoning_placeholder.info(f"πŸ€” Step {step_num}/{steps_per_sample}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
 
 
114
  bot.controller.label_arrows_on_screen()
115
  screenshot_bytes = bot.controller.take_street_view_screenshot()
116
+ image_placeholder.image(screenshot_bytes, caption=f"Step {step_num}")
117
 
118
+ current_step = {
 
 
 
 
 
 
 
 
119
  "image_b64": bot.pil_to_base64(
120
  Image.open(BytesIO(screenshot_bytes))
121
  ),
122
  "action": "N/A",
 
 
123
  }
124
+ history.append(current_step)
125
 
 
126
  available_actions = bot.controller.get_available_actions()
127
  history_text = "\n".join(
128
  [f"Step {j + 1}: {h['action']}" for j, h in enumerate(history[:-1])]
129
  )
130
  if not history_text:
131
+ history_text = "First step."
132
 
133
  prompt = AGENT_PROMPT_TEMPLATE.format(
134
  remaining_steps=steps_per_sample - step,
 
136
  available_actions=json.dumps(available_actions),
137
  )
138
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  message = bot._create_message_with_history(
140
  prompt, [h["image_b64"] for h in history]
141
  )
 
 
142
  response = bot.model.invoke(message)
143
  decision = bot._parse_agent_response(response)
144
 
145
+ if not decision:
146
  decision = {
147
  "action_details": {"action": "PAN_RIGHT"},
148
+ "reasoning": "Fallback",
149
  }
150
 
151
  action = decision.get("action_details", {}).get("action")
152
  history[-1]["action"] = action
 
153
 
154
+ reasoning_placeholder.success("βœ… Decision Made")
155
+ action_placeholder.success(f"🎯 Action: `{action}`")
156
 
157
  with action_placeholder:
158
+ with st.expander("Reasoning"):
159
+ st.write(decision.get("reasoning", "N/A"))
160
 
 
 
 
 
 
 
 
 
 
 
 
161
  if step_num == steps_per_sample and action != "GUESS":
 
162
  action = "GUESS"
163
 
 
164
  if action == "GUESS":
165
+ lat = decision.get("action_details", {}).get("lat")
166
+ lon = decision.get("action_details", {}).get("lon")
 
 
167
  if lat is not None and lon is not None:
168
  final_guess = (lat, lon)
169
+ break
 
 
 
 
 
170
  elif action == "MOVE_FORWARD":
171
+ bot.controller.move("forward")
 
172
  elif action == "MOVE_BACKWARD":
173
+ bot.controller.move("backward")
 
174
  elif action == "PAN_LEFT":
175
+ bot.controller.pan_view("left")
 
176
  elif action == "PAN_RIGHT":
177
+ bot.controller.pan_view("right")
 
178
 
179
+ time.sleep(1)
180
 
181
+ # Results
182
+ true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
183
+ distance_km = None
184
+ is_success = False
185
 
186
+ if final_guess:
187
+ distance_km = benchmark_helper.calculate_distance(
188
+ true_coords, final_guess
189
+ )
190
+ if distance_km is not None:
191
+ is_success = distance_km <= SUCCESS_THRESHOLD_KM
192
+
193
+ st.subheader("🎯 Result")
194
+ col1, col2, col3 = st.columns(3)
195
+ col1.metric("Guess", f"{final_guess[0]:.3f}, {final_guess[1]:.3f}")
196
+ col2.metric(
197
+ "Truth", f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}"
198
+ )
199
+ col3.metric(
200
+ "Distance",
201
+ f"{distance_km:.1f} km",
202
+ delta="Success" if is_success else "Failed",
203
+ )
204
 
205
+ all_results.append(
206
+ {
207
+ "sample_id": sample.get("id"),
208
+ "model": model_choice,
209
+ "true_coordinates": true_coords,
210
+ "predicted_coordinates": final_guess,
211
+ "distance_km": distance_km,
212
+ "success": is_success,
213
+ }
 
 
 
 
 
214
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
+ progress_bar.progress((i + 1) / num_samples)
217
+
218
+ # Summary
219
+ st.divider()
220
+ st.header("🏁 Summary")
221
  summary = benchmark_helper.generate_summary(all_results)
222
  if summary and model_choice in summary:
223
  stats = summary[model_choice]
224
+ col1, col2 = st.columns(2)
225
+ col1.metric("Success Rate", f"{stats.get('success_rate', 0) * 100:.1f}%")
226
+ col2.metric("Avg Distance", f"{stats.get('average_distance_km', 0):.1f} km")
227
+ st.dataframe(all_results)
 
 
 
 
 
 
config.py CHANGED
@@ -31,44 +31,32 @@ MODELS_CONFIG = {
31
  "gpt-4o": {
32
  "class": "ChatOpenAI",
33
  "model_name": "gpt-4o",
34
- "api_key_env": "OPENAI_API_KEY",
35
  "description": "OpenAI GPT-4o",
36
  },
37
  "gpt-4o-mini": {
38
  "class": "ChatOpenAI",
39
  "model_name": "gpt-4o-mini",
40
- "api_key_env": "OPENAI_API_KEY",
41
- "description": "OpenAI GPT-4o Mini (cheaper)",
42
  },
43
  "claude-3.5-sonnet": {
44
  "class": "ChatAnthropic",
45
  "model_name": "claude-3-5-sonnet-20240620",
46
- "api_key_env": "ANTHROPIC_API_KEY",
47
  "description": "Anthropic Claude 3.5 Sonnet",
48
  },
49
  "gemini-1.5-pro": {
50
  "class": "ChatGoogleGenerativeAI",
51
  "model_name": "gemini-1.5-pro-latest",
52
- "api_key_env": "GOOGLE_API_KEY",
53
  "description": "Google Gemini 1.5 Pro",
54
  },
55
- "gemini-2.5-pro": {
56
- "class": "ChatGoogleGenerativeAI",
57
- "model_name": "gemini-2.5-pro-preview-06-05",
58
- "api_key_env": "GOOGLE_API_KEY",
59
- "description": "Google Gemini 2.5 Pro",
60
- },
61
- "qwen2-vl-72b": {
62
  "class": "HuggingFaceChat",
63
- "model_name": "Qwen/Qwen2-VL-72B-Instruct",
64
- "api_key_env": "HUGGINGFACE_API_KEY",
65
- "description": "Qwen2-VL 72B (via HF Inference API)",
66
  },
67
- "qwen2-vl-7b": {
68
  "class": "HuggingFaceChat",
69
- "model_name": "Qwen/Qwen2-VL-7B-Instruct",
70
- "api_key_env": "HUGGINGFACE_API_KEY",
71
- "description": "Qwen2-VL 7B (via HF Inference API)",
72
  },
73
  }
74
 
 
31
  "gpt-4o": {
32
  "class": "ChatOpenAI",
33
  "model_name": "gpt-4o",
 
34
  "description": "OpenAI GPT-4o",
35
  },
36
  "gpt-4o-mini": {
37
  "class": "ChatOpenAI",
38
  "model_name": "gpt-4o-mini",
39
+ "description": "OpenAI GPT-4o Mini",
 
40
  },
41
  "claude-3.5-sonnet": {
42
  "class": "ChatAnthropic",
43
  "model_name": "claude-3-5-sonnet-20240620",
 
44
  "description": "Anthropic Claude 3.5 Sonnet",
45
  },
46
  "gemini-1.5-pro": {
47
  "class": "ChatGoogleGenerativeAI",
48
  "model_name": "gemini-1.5-pro-latest",
 
49
  "description": "Google Gemini 1.5 Pro",
50
  },
51
+ "qwen2.5-vl-7b": {
 
 
 
 
 
 
52
  "class": "HuggingFaceChat",
53
+ "model_name": "Qwen/Qwen2.5-VL-7B-Instruct",
54
+ "description": "Qwen2.5-VL 7B Vision-Language",
 
55
  },
56
+ "qwen2.5-vl-3b": {
57
  "class": "HuggingFaceChat",
58
+ "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
59
+ "description": "Qwen2.5-VL 3B Vision-Language",
 
60
  },
61
  }
62
 
hf_chat.py CHANGED
@@ -21,9 +21,9 @@ class HuggingFaceChat(BaseChatModel):
21
  api_token: Optional[str] = Field(default=None, description="HF API token")
22
 
23
  def __init__(self, model: str, temperature: float = 0.0, **kwargs):
24
- api_token = kwargs.get("api_token") or os.getenv("HUGGINGFACE_API_KEY")
25
  if not api_token:
26
- raise ValueError("HUGGINGFACE_API_KEY environment variable is required")
27
 
28
  super().__init__(
29
  model=model, temperature=temperature, api_token=api_token, **kwargs
 
21
  api_token: Optional[str] = Field(default=None, description="HF API token")
22
 
23
  def __init__(self, model: str, temperature: float = 0.0, **kwargs):
24
+ api_token = kwargs.get("api_token") or os.getenv("HF_TOKEN")
25
  if not api_token:
26
+ raise ValueError("HF_TOKEN environment variable is required")
27
 
28
  super().__init__(
29
  model=model, temperature=temperature, api_token=api_token, **kwargs