import json from typing import Any, Literal from pydantic import BaseModel def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]: guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge." return [ { "role": "user", "content": [ { "type": "image", "image": image, }, {"type": "text", "text": f"{guidelines}\n{instruction}"}, ], } ] class ClickAction(BaseModel): """Click at specific coordinates on the screen.""" action: Literal["click"] = "click" x: int """The x coordinate, number of pixels from the left edge.""" y: int """The y coordinate, number of pixels from the top edge.""" def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]: guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format." return [ { "role": "system", "content": json.dumps([ClickAction.model_json_schema()]), }, { "role": "user", "content": [ { "type": "image", "image": image, }, {"type": "text", "text": f"{guidelines}\n{instruction}"}, ], }, ]