Hcompany
/

Holo1-3B

@@ -80,6 +80,10 @@ benchmark [WebClick](https://huggingface.co/datasets/Hcompany/WebClick).
 ## Get Started with the Model
 We provide starter code for the localization task: i.e. image + instruction -> click coordinates
 We also provide code to reproduce screenspot evaluations: screenspot_eval.py
@@ -151,109 +155,47 @@ resized_height, resized_width = smart_resize(
     max_pixels=image_processor.max_pixels,
 )
 image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
-instruction = "Select July 14th as the check-out date"
 ```
-### Localization as click(x, y)
 ```python
-def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
-    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
-    return [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image,
-                },
-                {"type": "text", "text": f"{guidelines}\n{instruction}"},
-            ],
-        }
-    ]
-messages = get_localization_prompt(image, instruction)
-coordinates_str = run_inference(messages)[0]
-print(coordinates_str)
-# Expected Click(352, 348)
 ```
-### Structured Output
-We trained Holo1 as an Action VLM with extensive use of json and tool calls. Therefore, it can be queried reliably with structured output:
 ```python
-from pydantic import BaseModel, ConfigDict
-class FunctionDefinition(BaseModel):
-    """Function definition data structure.
-    Attributes:
-        name: name of the function.
-        description: description of the function.
-        parameters: JSON schema for the function parameters.
-        strict: Whether to enable strict schema adherence when generating the function call.
-    """
-    name: str
-    description: str = ""
-    parameters: dict[str, Any] = {}
-    strict: bool = True
-class ClickAction(BaseModel):
-    """Click at specific coordinates on the screen."""
-    model_config = ConfigDict(
-        extra="forbid",
-        json_schema_serialization_defaults_required=True,
-        json_schema_mode_override="serialization",
-        use_attribute_docstrings=True,
-    )
-    action: Literal["click"] = "click"
-    x: int
-    """The x coordinate, number of pixels from the left edge."""
-    y: int
-    """The y coordinate, number of pixels from the top edge."""
-function_definition = FunctionDefinition(
-    name="click_action",
-    description=ClickAction.__doc__ or "",
-    parameters=ClickAction.model_json_schema(),
-    strict=True,
-)
-def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
-    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
-    return [
-        {
-            "role": "system",
-            "content": json.dumps([function_definition.model_dump()]),
-        },
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image,
-                },
-                {"type": "text", "text": f"{guidelines}\n{instruction}"},
-            ],
-        },
-    ]
-messages = get_localization_prompt_structured_output(image, instruction)
-coordinates_str = run_inference(messages)[0]
-coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
-print(coordinates)
 # Expected ClickAction(action='click', x=352, y=340)
 ```

 ## Get Started with the Model
+We provide 2 spaces to experiment with Localization and Navigation:
+ - https://huggingface.co/spaces/Hcompany/Holo1-Navigation
+ - https://huggingface.co/spaces/Hcompany/Holo1-Localization
 We provide starter code for the localization task: i.e. image + instruction -> click coordinates
 We also provide code to reproduce screenspot evaluations: screenspot_eval.py
     max_pixels=image_processor.max_pixels,
 )
 image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
 ```
+### Navigation with Structured Output
 ```python
+import json
+from . import navigation
+task = "Book a hotel in Paris on August 3rd for 3 nights"
+prompt = navigation.get_navigation_prompt(task, image, step=1)
+navigation_str = run_inference(prompt)[0]
+# NavigationStep is defined in the navigation module, so it must be qualified here.
+# The result gets its own name so the imported `navigation` module is not shadowed.
+navigation_step = navigation.NavigationStep(**json.loads(navigation_str))
+print(navigation_step)
+# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
 ```
+### Localization with click(x, y)
 ```python
+from . import localization
+instruction = "Select July 14th as the check-out date"
+# Build the single-turn chat prompt (screenshot + instruction) that asks for a Click(x, y) answer.
+prompt = localization.get_localization_prompt(image, instruction)
+# `image` and `run_inference` are presumably defined by the earlier README snippets - confirm.
+# Only the first element of the returned sequence is used.
+coordinates = run_inference(prompt)[0]
+print(coordinates)
+# Expected Click(352, 348)
+```
+### Localization with Structured Output
+We trained Holo1 as an Action VLM with extensive use of JSON and tool calls. Therefore, it can be queried reliably with structured output:
+```python
+import json
+from . import localization
+instruction = "Select July 14th as the check-out date"
+# The structured variant adds a system message carrying the ClickAction JSON schema.
+prompt = localization.get_localization_prompt_structured_output(image, instruction)
+# `image` and `run_inference` are presumably defined by the earlier README snippets - confirm.
+coordinates_structured_str = run_inference(prompt)[0]
+# NOTE(review): the previous README parsed json.loads(...)["arguments"]; confirm the model now
+# emits a bare object whose keys match the ClickAction fields.
+coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
+print(coordinates_structured)
 # Expected ClickAction(action='click', x=352, y=340)
 ```

localization.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import json
+from typing import Any, Literal
+from pydantic import BaseModel
+def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
+    """Build the single-turn chat prompt that asks for a ``Click(x, y)`` answer.
+
+    Args:
+        image: Screenshot to localize on; passed through unchanged as the image content part.
+        instruction: Natural-language description of the element to click.
+
+    Returns:
+        A one-element message list. The single ``user`` message contains the image part
+        followed by a text part holding the guidelines and the instruction, newline-separated.
+    """
+    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
+    image_part: dict[str, Any] = {"type": "image", "image": image}
+    text_part: dict[str, Any] = {"type": "text", "text": f"{guidelines}\n{instruction}"}
+    user_message: dict[str, Any] = {"role": "user", "content": [image_part, text_part]}
+    return [user_message]
+class ClickAction(BaseModel):
+    """Click at specific coordinates on the screen."""
+    # Without this flag pydantic ignores the bare-string attribute docstrings below, so the
+    # JSON schema produced by model_json_schema() would carry no descriptions for x and y.
+    # A plain dict is used so no extra pydantic import is required.
+    model_config = {"use_attribute_docstrings": True}
+    action: Literal["click"] = "click"
+    x: int
+    """The x coordinate, number of pixels from the left edge."""
+    y: int
+    """The y coordinate, number of pixels from the top edge."""
+def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
+    """Build the two-message chat prompt that asks for a JSON click action.
+
+    Args:
+        image: Screenshot to localize on; passed through unchanged as the image content part.
+        instruction: Natural-language description of the element to click.
+
+    Returns:
+        A list with a ``system`` message whose content is the JSON-encoded, one-element list
+        holding the ``ClickAction`` JSON schema, followed by a ``user`` message containing the
+        image part and a text part (guidelines and instruction, newline-separated).
+    """
+    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
+    schema_payload = json.dumps([ClickAction.model_json_schema()])
+    system_message: dict[str, Any] = {"role": "system", "content": schema_payload}
+    image_part: dict[str, Any] = {"type": "image", "image": image}
+    text_part: dict[str, Any] = {"type": "text", "text": f"{guidelines}\n{instruction}"}
+    user_message: dict[str, Any] = {"role": "user", "content": [image_part, text_part]}
+    return [system_message, user_message]

navigation.py ADDED Viewed

	@@ -0,0 +1,186 @@

+from typing import Literal
+from pydantic import BaseModel, Field
+# Template for the navigation agent's system message.
+# It contains two replacement fields, {timestamp} and {output_format}.
+# The wording is sent to the model verbatim, so the text itself is deliberately left untouched.
+SYSTEM_PROMPT: str = """Imagine you are a robot browsing the web, just like humans. Now you need to complete a task.
+In each iteration, you will receive an Observation that includes the last  screenshots of a web browser and the current memory of the agent.
+You have also information about the step that the agent is trying to achieve to solve the task.
+Carefully analyze the visual information to identify what to do, then follow the guidelines to choose the following action.
+You should detail your thought (i.e. reasoning steps) before taking the action.
+Also detail in the notes field of the action the extracted information relevant to solve the task.
+Once you have enough information in the notes to answer the task, return an answer action with the detailed answer in the notes field.
+This will be evaluated by an evaluator and should match all the criteria or requirements of the task.
+Guidelines:
+- store in the notes all the relevant information to solve the task that fulfill the task criteria. Be precise
+- Use both the task and the step information to decide what to do
+- if you want to write in a text field and the text field already has text, designate the text field by the text it contains and its type
+- If there is a cookies notice, always accept all the cookies first
+- The observation is the screenshot of the current page and the memory of the agent.
+- If you see relevant information on the screenshot to answer the task, add it to the notes field of the action.
+- If there is no relevant information on the screenshot to answer the task, add an empty string to the notes field of the action.
+- If you see buttons that allow to navigate directly to relevant information, like jump to ... or go to ... , use them to navigate faster.
+- In the answer action, give as many details a possible relevant to answering the task.
+- if you want to write, don't click before. Directly use the write action
+- to write, identify the web element which is type and the text it already contains
+- If you want to use a search bar, directly write text in the search bar
+- Don't scroll too much. Don't scroll if the number of scrolls is greater than 3
+- Don't scroll if you are at the end of the webpage
+- Only refresh if you identify a rate limit problem
+- If you are looking for a single flights, click on round-trip to select 'one way'
+- Never try to login, enter email or password. If there is a need to login, then go back.
+- If you are facing a captcha on a website, try to solve it.
+- if you have enough information in the screenshot and in the notes to answer the task, return an answer action with the detailed answer in the notes field
+- The current date is {timestamp}.
+# <output_json_format>
+# ```json
+# {output_format}
+# ```
+# </output_json_format>
+"""
+# Action model: click a described web element at absolute pixel coordinates.
+# The class docstring and Field descriptions end up in pydantic's generated JSON schema,
+# so they are effectively prompt text - edit them with care.
+class ClickElementAction(BaseModel):
+    """Click at absolute coordinates of a web element with its description"""
+    # `action` has no default, so it must be supplied and must equal "click_element".
+    action: Literal["click_element"] = Field(description="Click at absolute coordinates of a web element")
+    element: str = Field(description="text description of the element")
+    x: int = Field(description="The x coordinate, number of pixels from the left edge.")
+    y: int = Field(description="The y coordinate, number of pixels from the top edge.")
+    def log(self):
+        # First-person, human-readable summary of the click (element text and coordinates).
+        return f"I have clicked on the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
+# Action model: type text into a described web element at absolute pixel coordinates.
+# The class docstring and Field descriptions end up in pydantic's generated JSON schema,
+# so they are effectively prompt text - edit them with care.
+class WriteElementAction(BaseModel):
+    """Write content at absolute coordinates of a web element identified by its description, then press Enter."""
+    # `action` has no default, so it must be supplied and must equal "write_element_abs".
+    action: Literal["write_element_abs"] = Field(description="Write content at absolute coordinates of a web page")
+    content: str = Field(description="Content to write")
+    element: str = Field(description="Text description of the element")
+    x: int = Field(description="The x coordinate, number of pixels from the left edge.")
+    y: int = Field(description="The y coordinate, number of pixels from the top edge.")
+    def log(self):
+        # First-person summary: what was written, into which element, and where.
+        return f"I have written '{self.content}' in the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
+# Action model: scroll in one of four directions.
+# NOTE(review): the `action` description mentions "a specific element", but this model has no
+# element field - only a direction is carried.
+class ScrollAction(BaseModel):
+    """Scroll action with no required element"""
+    # `action` has no default, so it must be supplied and must equal "scroll".
+    action: Literal["scroll"] = Field(description="Scroll the page or a specific element")
+    direction: Literal["down", "up", "left", "right"] = Field(description="The direction to scroll in")
+    def log(self):
+        # First-person summary naming the scroll direction.
+        return f"I have scrolled {self.direction}"
+# Action model: go back one page; carries no data beyond the `action` tag.
+class GoBackAction(BaseModel):
+    """Action to navigate back in browser history"""
+    # `action` has no default, so it must be supplied and must equal "go_back".
+    action: Literal["go_back"] = Field(description="Navigate to the previous page")
+    def log(self):
+        # Fixed first-person summary; there are no fields to interpolate.
+        return "I have gone back to the previous page"
+# Action model: reload the current page; carries no data beyond the `action` tag.
+class RefreshAction(BaseModel):
+    """Action to refresh the current page"""
+    # `action` has no default, so it must be supplied and must equal "refresh".
+    action: Literal["refresh"] = Field(description="Refresh the current page")
+    def log(self):
+        # Fixed first-person summary; there are no fields to interpolate.
+        return "I have refreshed the page"
+# Action model: navigate directly to a URL.
+class GotoAction(BaseModel):
+    """Action to go to a particular URL"""
+    # `action` has no default, so it must be supplied and must equal "goto".
+    action: Literal["goto"] = Field(description="Goto a particular URL")
+    # The http(s):// prefix is only stated in the description; no validator enforces it here.
+    url: str = Field(description="A url starting with http:// or https://")
+    def log(self):
+        # First-person summary naming the target URL.
+        return f"I have navigated to the URL {self.url}"
+# Action model: pause for a bounded number of seconds.
+class WaitAction(BaseModel):
+    """Action to wait for a particular amount of time"""
+    # `action` has no default, so it must be supplied and must equal "wait".
+    action: Literal["wait"] = Field(description="Wait for a particular amount of time")
+    # Optional; defaults to 2 and is constrained to the inclusive range 0..10.
+    seconds: int = Field(default=2, ge=0, le=10, description="The number of seconds to wait")
+    def log(self):
+        # First-person summary naming the wait duration.
+        return f"I have waited for {self.seconds} seconds"
+# Action model: restart the task; carries no data beyond the `action` tag.
+class RestartAction(BaseModel):
+    """Restart the task from the beginning."""
+    # Unlike most sibling actions, `action` has a default ("restart") and may be omitted.
+    action: Literal["restart"] = "restart"
+    def log(self):
+        # Fixed first-person summary; there are no fields to interpolate.
+        return "I have restarted the task from the beginning"
+# Action model: deliver the final answer for the task.
+class AnswerAction(BaseModel):
+    """Return a final answer to the task. This is the last action to call in an episode."""
+    # Unlike most sibling actions, `action` has a default ("answer") and may be omitted.
+    action: Literal["answer"] = "answer"
+    content: str = Field(description="The answer content")
+    def log(self):
+        # First-person summary quoting the answer text.
+        return f"I have answered the task with '{self.content}'"
+# Union of every action model the agent may emit; used as the type of NavigationStep.action.
+# No explicit discriminator is configured; each member declares a distinct Literal value for
+# its `action` field.
+ActionSpace = (
+    ClickElementAction
+    | WriteElementAction
+    | ScrollAction
+    | GoBackAction
+    | RefreshAction
+    | WaitAction
+    | RestartAction
+    | AnswerAction
+    | GotoAction
+)
+# One step of the agent's structured output: an optional note, a thought, and the next action.
+# Documented with comments rather than a class docstring because a docstring would be added to
+# the JSON schema that get_navigation_prompt interpolates into the system prompt.
+class NavigationStep(BaseModel):
+    # Optional; defaults to the empty string.
+    note: str = Field(
+        default="",
+        description="Task-relevant information extracted from the previous observation. Keep empty if no new info.",
+    )
+    # Required. The "<4 lines" limit is stated in the description only, not enforced.
+    thought: str = Field(description="Reasoning about next steps (<4 lines)")
+    # Required; must validate as one of the ActionSpace members.
+    action: ActionSpace = Field(description="Next action to take")
+def get_navigation_prompt(task: str, image, step: int = 1, timestamp: str = "2025-06-04 14:16:03") -> list[dict]:
+    """Build the chat messages for one navigation step.
+
+    Args:
+        task: The task the agent has to complete; wrapped in ``<task>`` tags.
+        image: Screenshot of the current page; passed through unchanged as the image part.
+        step: Step number written into the ``<observation step=...>`` tag.
+        timestamp: Text substituted for ``{timestamp}`` in the system prompt ("The current
+            date is ..."). Defaults to the date that used to be hard-coded, so existing
+            callers get exactly the same prompt.
+
+    Returns:
+        A two-message list: a ``system`` message holding the formatted SYSTEM_PROMPT and a
+        ``user`` message holding the task, the observation tags and the screenshot.
+    """
+    # NOTE(review): str.format inserts the schema dict using its Python string form, not
+    # json.dumps output, although the template wraps it in a ```json fence. Left unchanged
+    # because switching would alter the exact prompt text - confirm which form is intended.
+    system_prompt = SYSTEM_PROMPT.format(
+        output_format=NavigationStep.model_json_schema(),
+        timestamp=timestamp,
+    )
+    return [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": system_prompt},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": f"<task>\n{task}\n</task>\n"},
+                {"type": "text", "text": f"<observation step={step}>\n"},
+                {"type": "text", "text": "<screenshot>\n"},
+                {
+                    "type": "image",
+                    "image": image,
+                },
+                {"type": "text", "text": "\n</screenshot>\n"},
+                {"type": "text", "text": "\n</observation>\n"},
+            ],
+        },
+    ]
+    ]