Andy Lee committed
Commit 4d37e51 · 1 Parent(s): 23ee129

feat: more models, including qwen

Files changed (4)
  1. app.py +104 -25
  2. config.py +26 -0
  3. geo_bot.py +32 -12
  4. hf_chat.py +142 -0
app.py CHANGED
@@ -136,63 +136,138 @@ if start_button:
     # --- Inner agent exploration loop ---
     history = []
     final_guess = None
+    step_history_container = st.container()
 
     for step in range(steps_per_sample):
         step_num = step + 1
         reasoning_placeholder.info(
-            f"Thinking... (Step {step_num}/{steps_per_sample})"
+            f"🤔 Thinking... (Step {step_num}/{steps_per_sample})"
         )
         action_placeholder.empty()
 
         # Observe and label arrows
         bot.controller.label_arrows_on_screen()
         screenshot_bytes = bot.controller.take_street_view_screenshot()
+
+        # Current view
         image_placeholder.image(
-            screenshot_bytes, caption=f"Step {step_num} View", use_column_width=True
+            screenshot_bytes,
+            caption=f"🔍 Step {step_num} - What AI Sees Now",
+            use_column_width=True,
         )
 
         # Update history
-        history.append(
-            {
-                "image_b64": bot.pil_to_base64(
-                    Image.open(BytesIO(screenshot_bytes))
-                ),
-                "action": "N/A",
-            }
-        )
+        current_step_data = {
+            "image_b64": bot.pil_to_base64(Image.open(BytesIO(screenshot_bytes))),
+            "action": "N/A",
+            "screenshot_bytes": screenshot_bytes,
+            "step_num": step_num,
+        }
+        history.append(current_step_data)
 
         # Think
+        available_actions = bot.controller.get_available_actions()
+        history_text = "\n".join(
+            [f"Step {j + 1}: {h['action']}" for j, h in enumerate(history[:-1])]
+        )
+        if not history_text:
+            history_text = "No history yet. This is the first step."
+
         prompt = AGENT_PROMPT_TEMPLATE.format(
             remaining_steps=steps_per_sample - step,
-            history_text="\n".join(
-                [f"Step {j + 1}: {h['action']}" for j, h in enumerate(history)]
-            ),
-            available_actions=json.dumps(bot.controller.get_available_actions()),
+            history_text=history_text,
+            available_actions=json.dumps(available_actions),
         )
+
+        # Show what AI is considering
+        with reasoning_placeholder:
+            st.info("🧠 **AI is analyzing the situation...**")
+            with st.expander("🔍 Available Actions", expanded=False):
+                st.json(available_actions)
+            with st.expander("📝 Context Being Considered", expanded=False):
+                st.text_area(
+                    "History Context:", history_text, height=100, disabled=True
+                )
+
         message = bot._create_message_with_history(
             prompt, [h["image_b64"] for h in history]
         )
+
+        # Get AI response
         response = bot.model.invoke(message)
         decision = bot._parse_agent_response(response)
 
         if not decision:  # Fallback
             decision = {
                 "action_details": {"action": "PAN_RIGHT"},
-                "reasoning": "Default recovery.",
+                "reasoning": "⚠️ Response parsing failed. Using default recovery action.",
             }
 
         action = decision.get("action_details", {}).get("action")
         history[-1]["action"] = action
-
-        reasoning_placeholder.info(
-            f"**AI Reasoning:**\n\n{decision.get('reasoning', 'N/A')}"
+        history[-1]["reasoning"] = decision.get("reasoning", "N/A")
+        history[-1]["raw_response"] = (
+            response.content[:500] + "..."
+            if len(response.content) > 500
+            else response.content
         )
-        action_placeholder.success(f"**AI Action:** `{action}`")
+
+        # Display AI's decision process
+        reasoning_placeholder.success("✅ **AI Decision Made!**")
+
+        with action_placeholder:
+            st.success(f"🎯 **AI Action:** `{action}`")
+
+        # Detailed reasoning display
+        with st.expander("🧠 AI's Detailed Thinking Process", expanded=True):
+            col_reason, col_raw = st.columns([2, 1])
+
+            with col_reason:
+                st.markdown("**🤔 AI's Reasoning:**")
+                st.info(decision.get("reasoning", "N/A"))
+
+                if action == "GUESS":
+                    lat = decision.get("action_details", {}).get("lat")
+                    lon = decision.get("action_details", {}).get("lon")
+                    if lat and lon:
+                        st.success(f"📍 **Final Guess:** {lat:.4f}, {lon:.4f}")
+
+            with col_raw:
+                st.markdown("**🔤 Raw AI Response:**")
+                st.text_area(
+                    "Full Response:",
+                    history[-1]["raw_response"],
+                    height=200,
+                    disabled=True,
+                    key=f"raw_response_{step_num}",
+                )
+
+        # Store step in history display
+        with step_history_container:
+            with st.expander(f"📚 Step {step_num} History", expanded=False):
+                hist_col1, hist_col2 = st.columns([1, 2])
+                with hist_col1:
+                    st.image(
+                        screenshot_bytes, caption=f"Step {step_num} View", width=200
+                    )
+                with hist_col2:
+                    st.write(f"**Action:** {action}")
+                    st.write(
+                        f"**Reasoning:** {decision.get('reasoning', 'N/A')[:150]}..."
+                    )
 
         # Force a GUESS on the last step
         if step_num == steps_per_sample and action != "GUESS":
-            st.warning("Max steps reached. Forcing a GUESS action.")
+            st.warning("Max steps reached. Forcing a GUESS action.")
             action = "GUESS"
+            # Force coordinates if missing
+            if not decision.get("action_details", {}).get("lat"):
+                st.error("❌ AI didn't provide coordinates. Using fallback guess.")
+                decision["action_details"] = {
+                    "action": "GUESS",
+                    "lat": 0.0,
+                    "lon": 0.0,
+                }
 
         # Act
         if action == "GUESS":
@@ -204,18 +279,22 @@ if start_button:
                 final_guess = (lat, lon)
             else:
                 st.error(
-                    "GUESS action was missing coordinates. Guess failed for this sample."
+                    "GUESS action was missing coordinates. Guess failed for this sample."
                 )
             break  # End exploration for the current sample
 
         elif action == "MOVE_FORWARD":
-            bot.controller.move("forward")
+            with st.spinner("🚶 Moving forward..."):
+                bot.controller.move("forward")
        elif action == "MOVE_BACKWARD":
-            bot.controller.move("backward")
+            with st.spinner("🔄 Moving backward..."):
+                bot.controller.move("backward")
         elif action == "PAN_LEFT":
-            bot.controller.pan_view("left")
+            with st.spinner("⬅️ Panning left..."):
+                bot.controller.pan_view("left")
         elif action == "PAN_RIGHT":
-            bot.controller.pan_view("right")
+            with st.spinner("➡️ Panning right..."):
+                bot.controller.pan_view("right")
 
         time.sleep(1)  # A brief pause between steps for better visualization
 
 
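Note: the exploration loop above branches on a parsed decision dict: `action_details.action` selects the branch, and `lat`/`lon` are read only for `GUESS`. A minimal sketch of that payload, with field names taken from the diff and purely illustrative values:

```python
# Illustrative only: the shape _parse_agent_response() is expected to return.
# Field names come from the diff above; the concrete values are invented.
example_decision = {
    "reasoning": "Portuguese signage and right-hand traffic suggest coastal Brazil.",
    "action_details": {
        "action": "GUESS",  # or MOVE_FORWARD / MOVE_BACKWARD / PAN_LEFT / PAN_RIGHT
        "lat": -22.9068,    # present only for GUESS
        "lon": -43.1729,
    },
}
```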
config.py CHANGED
@@ -31,18 +31,44 @@ MODELS_CONFIG = {
     "gpt-4o": {
         "class": "ChatOpenAI",
         "model_name": "gpt-4o",
+        "api_key_env": "OPENAI_API_KEY",
+        "description": "OpenAI GPT-4o",
+    },
+    "gpt-4o-mini": {
+        "class": "ChatOpenAI",
+        "model_name": "gpt-4o-mini",
+        "api_key_env": "OPENAI_API_KEY",
+        "description": "OpenAI GPT-4o Mini (cheaper)",
     },
     "claude-3.5-sonnet": {
         "class": "ChatAnthropic",
         "model_name": "claude-3-5-sonnet-20240620",
+        "api_key_env": "ANTHROPIC_API_KEY",
+        "description": "Anthropic Claude 3.5 Sonnet",
     },
     "gemini-1.5-pro": {
         "class": "ChatGoogleGenerativeAI",
         "model_name": "gemini-1.5-pro-latest",
+        "api_key_env": "GOOGLE_API_KEY",
+        "description": "Google Gemini 1.5 Pro",
     },
     "gemini-2.5-pro": {
         "class": "ChatGoogleGenerativeAI",
         "model_name": "gemini-2.5-pro-preview-06-05",
+        "api_key_env": "GOOGLE_API_KEY",
+        "description": "Google Gemini 2.5 Pro",
+    },
+    "qwen2-vl-72b": {
+        "class": "HuggingFaceChat",
+        "model_name": "Qwen/Qwen2-VL-72B-Instruct",
+        "api_key_env": "HUGGINGFACE_API_KEY",
+        "description": "Qwen2-VL 72B (via HF Inference API)",
+    },
+    "qwen2-vl-7b": {
+        "class": "HuggingFaceChat",
+        "model_name": "Qwen/Qwen2-VL-7B-Instruct",
+        "api_key_env": "HUGGINGFACE_API_KEY",
+        "description": "Qwen2-VL 7B (via HF Inference API)",
+    },
 }
 
 
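Note: the new `class`, `api_key_env`, and `description` keys imply the app resolves a config entry to a model class at runtime. A hypothetical sketch of such a factory follows; the `build_model` helper and `CLASS_MAP` are not part of this commit, while the key names and class names match the config above:

```python
# Hypothetical sketch (not part of the commit): resolving a MODELS_CONFIG entry.
import os

from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

from hf_chat import HuggingFaceChat
from config import MODELS_CONFIG

CLASS_MAP = {
    "ChatOpenAI": ChatOpenAI,
    "ChatAnthropic": ChatAnthropic,
    "ChatGoogleGenerativeAI": ChatGoogleGenerativeAI,
    "HuggingFaceChat": HuggingFaceChat,
}


def build_model(config_key: str, temperature: float = 0.0):
    cfg = MODELS_CONFIG[config_key]
    if not os.getenv(cfg["api_key_env"]):
        # Fail early, using the human-readable description from the config.
        raise ValueError(f"{cfg['api_key_env']} is not set ({cfg['description']})")
    return CLASS_MAP[cfg["class"]](model=cfg["model_name"], temperature=temperature)


# e.g. build_model("qwen2-vl-7b") -> HuggingFaceChat(model="Qwen/Qwen2-VL-7B-Instruct")
```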
geo_bot.py CHANGED
@@ -11,10 +11,11 @@ from langchain_openai import ChatOpenAI
 from langchain_anthropic import ChatAnthropic
 from langchain_google_genai import ChatGoogleGenerativeAI
 
+from hf_chat import HuggingFaceChat
+
 from mapcrunch_controller import MapCrunchController
 
 # The "Golden" Prompt (v6): Combines clear mechanics with robust strategic principles.
-
 AGENT_PROMPT_TEMPLATE = """
 **Mission:** You are an expert geo-location agent. Your goal is to find clues to determine your location within a limited number of steps.
 
@@ -68,11 +69,20 @@ class GeoBot:
     ):
         # Initialize model with temperature parameter
         model_kwargs = {
-            "model": model_name,
             "temperature": temperature,
         }
 
-        self.model = model(**model_kwargs)
+        # Handle different model types
+        if model == HuggingFaceChat and HuggingFaceChat is not None:
+            model_kwargs["model"] = model_name
+        else:
+            model_kwargs["model"] = model_name
+
+        try:
+            self.model = model(**model_kwargs)
+        except Exception as e:
+            raise ValueError(f"Failed to initialize model {model_name}: {e}")
+
         self.model_name = model_name
         self.temperature = temperature
         self.use_selenium = use_selenium
@@ -90,6 +100,7 @@
     ) -> List[HumanMessage]:
         """Creates a message for the LLM that includes text and a sequence of images."""
         content = [{"type": "text", "text": prompt}]
+
         # Add the JSON format instructions right after the main prompt text
         content.append(
             {
@@ -145,7 +156,6 @@
             print(f"\n--- Step {max_steps - step + 1}/{max_steps} ---")
 
             self.controller.setup_clean_environment()
-
             self.controller.label_arrows_on_screen()
 
             screenshot_bytes = self.controller.take_street_view_screenshot()
@@ -178,17 +188,22 @@
                 available_actions=json.dumps(available_actions),
             )
 
-            message = self._create_message_with_history(prompt, image_b64_for_prompt)
-            response = self.model.invoke(message)
-
-            decision = self._parse_agent_response(response)
+            try:
+                message = self._create_message_with_history(
+                    prompt, image_b64_for_prompt
+                )
+                response = self.model.invoke(message)
+                decision = self._parse_agent_response(response)
+            except Exception as e:
+                print(f"Error during model invocation: {e}")
+                decision = None
 
             if not decision:
                 print(
-                    "Response parsing failed. Using default recovery action: PAN_RIGHT."
+                    "Response parsing failed or model error. Using default recovery action: PAN_RIGHT."
                 )
                 decision = {
-                    "reasoning": "Recovery due to parsing failure.",
+                    "reasoning": "Recovery due to parsing failure or model error.",
                     "action_details": {"action": "PAN_RIGHT"},
                 }
 
@@ -219,8 +234,13 @@
     def analyze_image(self, image: Image.Image) -> Optional[Tuple[float, float]]:
         image_b64 = self.pil_to_base64(image)
         message = self._create_llm_message(BENCHMARK_PROMPT, image_b64)
-        response = self.model.invoke(message)
-        print(f"\nLLM Response:\n{response.content}")
+
+        try:
+            response = self.model.invoke(message)
+            print(f"\nLLM Response:\n{response.content}")
+        except Exception as e:
+            print(f"Error during image analysis: {e}")
+            return None
 
         content = response.content.strip()
         last_line = ""
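Note: `_create_message_with_history` (referenced above) builds the multimodal `HumanMessage` that `HuggingFaceChat._format_message_for_hf` later unpacks, so the two must agree on the content-part shape. A minimal sketch of that shape, assuming PNG screenshots encoded as base64 data URLs; the helper name is illustrative:

```python
# Illustrative sketch: the message shape hf_chat.py expects to receive.
from typing import List

from langchain_core.messages import HumanMessage


def make_multimodal_message(prompt: str, images_b64: List[str]) -> HumanMessage:
    content = [{"type": "text", "text": prompt}]
    for b64 in images_b64:
        # hf_chat.py splits on "," and checks for the "data:image" prefix.
        content.append(
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
        )
    return HumanMessage(content=content)
```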
hf_chat.py ADDED
@@ -0,0 +1,142 @@
+"""
+HuggingFace Chat Model Wrapper for vision models like Qwen2-VL
+"""
+
+import os
+import base64
+import requests
+from typing import List, Dict, Any, Optional
+from langchain_core.messages import BaseMessage, HumanMessage
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.outputs import ChatResult, ChatGeneration
+from pydantic import Field
+
+
+class HuggingFaceChat(BaseChatModel):
+    """Chat model wrapper for HuggingFace Inference API"""
+
+    model: str = Field(description="HuggingFace model name")
+    temperature: float = Field(default=0.0, description="Temperature for sampling")
+    max_tokens: int = Field(default=1000, description="Max tokens to generate")
+    api_token: Optional[str] = Field(default=None, description="HF API token")
+
+    def __init__(self, model: str, temperature: float = 0.0, **kwargs):
+        api_token = kwargs.get("api_token") or os.getenv("HUGGINGFACE_API_KEY")
+        if not api_token:
+            raise ValueError("HUGGINGFACE_API_KEY environment variable is required")
+
+        super().__init__(
+            model=model, temperature=temperature, api_token=api_token, **kwargs
+        )
+
+    @property
+    def _llm_type(self) -> str:
+        return "huggingface_chat"
+
+    def _format_message_for_hf(self, message: HumanMessage) -> Dict[str, Any]:
+        """Convert LangChain message to HuggingFace format"""
+        if isinstance(message.content, str):
+            return {"role": "user", "content": message.content}
+
+        # Handle multi-modal content (text + images)
+        formatted_content = []
+        for item in message.content:
+            if item["type"] == "text":
+                formatted_content.append({"type": "text", "text": item["text"]})
+            elif item["type"] == "image_url":
+                # Extract base64 data from data URL
+                image_url = item["image_url"]["url"]
+                if image_url.startswith("data:image"):
+                    # Extract base64 data
+                    base64_data = image_url.split(",")[1]
+                    formatted_content.append({"type": "image", "image": base64_data})
+
+        return {"role": "user", "content": formatted_content}
+
+    def _generate(self, messages: List[BaseMessage], **kwargs) -> ChatResult:
+        """Generate response using HuggingFace Inference API"""
+
+        # Format messages for HF API
+        formatted_messages = []
+        for msg in messages:
+            if isinstance(msg, HumanMessage):
+                formatted_messages.append(self._format_message_for_hf(msg))
+
+        # Prepare API request
+        api_url = f"https://api-inference.huggingface.co/models/{self.model}/v1/chat/completions"
+        headers = {
+            "Authorization": f"Bearer {self.api_token}",
+            "Content-Type": "application/json",
+        }
+
+        payload = {
+            "model": self.model,
+            "messages": formatted_messages,
+            "temperature": self.temperature,
+            "max_tokens": self.max_tokens,
+            "stream": False,
+        }
+
+        try:
+            response = requests.post(api_url, headers=headers, json=payload, timeout=60)
+            response.raise_for_status()
+
+            result = response.json()
+            content = result["choices"][0]["message"]["content"]
+
+            return ChatResult(
+                generations=[ChatGeneration(message=HumanMessage(content=content))]
+            )
+
+        except requests.exceptions.RequestException as e:
+            # Fallback to simple text-only API if chat completions fail
+            return self._fallback_generate(messages, **kwargs)
+
+    def _fallback_generate(self, messages: List[BaseMessage], **kwargs) -> ChatResult:
+        """Fallback to simple HF Inference API"""
+        try:
+            # Use simple inference API as fallback
+            api_url = f"https://api-inference.huggingface.co/models/{self.model}"
+            headers = {
+                "Authorization": f"Bearer {self.api_token}",
+                "Content-Type": "application/json",
+            }
+
+            # Extract text content only for fallback
+            text_content = ""
+            for msg in messages:
+                if isinstance(msg, HumanMessage):
+                    if isinstance(msg.content, str):
+                        text_content += msg.content
+                    else:
+                        for item in msg.content:
+                            if item["type"] == "text":
+                                text_content += item["text"] + "\n"
+
+            payload = {
+                "inputs": text_content,
+                "parameters": {
+                    "temperature": self.temperature,
+                    "max_new_tokens": self.max_tokens,
+                },
+            }
+
+            response = requests.post(api_url, headers=headers, json=payload, timeout=60)
+            response.raise_for_status()
+
+            result = response.json()
+            if isinstance(result, list) and len(result) > 0:
+                content = result[0].get("generated_text", "No response generated")
+            else:
+                content = "Error: Invalid response format"
+
+            return ChatResult(
+                generations=[ChatGeneration(message=HumanMessage(content=content))]
+            )
+
+        except Exception as e:
+            # Last resort fallback
+            error_msg = f"HuggingFace API Error: {str(e)}. Please check your API key and model availability."
+            return ChatResult(
+                generations=[ChatGeneration(message=HumanMessage(content=error_msg))]
+            )
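Note: a usage sketch for the new wrapper follows. It requires `HUGGINGFACE_API_KEY` in the environment; the model name comes from config.py above, and the screenshot filename is a placeholder:

```python
# Usage sketch for HuggingFaceChat; invoke() is inherited from BaseChatModel.
import base64

from langchain_core.messages import HumanMessage
from hf_chat import HuggingFaceChat

chat = HuggingFaceChat(model="Qwen/Qwen2-VL-7B-Instruct", temperature=0.0)

with open("street_view.png", "rb") as f:  # placeholder screenshot file
    img_b64 = base64.b64encode(f.read()).decode()

message = HumanMessage(
    content=[
        {"type": "text", "text": "Which country is this street likely in?"},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{img_b64}"},
        },
    ]
)
print(chat.invoke([message]).content)
```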