samihalawa committed on
Commit
63f39a2
verified · 1 Parent(s): bc67178

Upload folder using huggingface_hub

.dockerignore ADDED
@@ -0,0 +1,2 @@
1
+ data
2
+ tmp
.env.example ADDED
@@ -0,0 +1,50 @@
1
+ OPENAI_ENDPOINT=https://api.openai.com/v1
2
+ OPENAI_API_KEY="sk-proj-hQFpY04IYat31CTUGUbtT3BlbkFJVu40kflj9atK0WkP1dNx"
3
+
4
+ ANTHROPIC_API_KEY=
5
+ ANTHROPIC_ENDPOINT=https://api.anthropic.com
6
+
7
+ GOOGLE_API_KEY=
8
+
9
+ AZURE_OPENAI_ENDPOINT=
10
+ AZURE_OPENAI_API_KEY=
11
+ AZURE_OPENAI_API_VERSION=2025-01-01-preview
12
+
13
+ DEEPSEEK_ENDPOINT=https://api.deepseek.com
14
+ DEEPSEEK_API_KEY=
15
+
16
+ MISTRAL_API_KEY=
17
+ MISTRAL_ENDPOINT=https://api.mistral.ai/v1
18
+
19
+ OLLAMA_ENDPOINT=http://localhost:11434
20
+
21
+ ALIBABA_ENDPOINT=https://dashscope.aliyuncs.com/compatible-mode/v1
22
+ ALIBABA_API_KEY=
23
+
24
+ MOONSHOT_ENDPOINT=https://api.moonshot.cn/v1
25
+ MOONSHOT_API_KEY=
26
+
27
+ # Set to false to disable anonymized telemetry
28
+ ANONYMIZED_TELEMETRY=true
29
+
30
+ # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
31
+ BROWSER_USE_LOGGING_LEVEL=info
32
+
33
+ # Chrome settings
34
+ CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
35
+ CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
36
+ CHROME_DEBUGGING_PORT=9222
37
+ CHROME_DEBUGGING_HOST=localhost
38
+ # Set to true to keep browser open between AI tasks
39
+ CHROME_PERSISTENT_SESSION=true
40
+
41
+ # Display settings
42
+ # Format: WIDTHxHEIGHTxDEPTH
43
+ RESOLUTION=1920x1080x24
44
+ # Width in pixels
45
+ RESOLUTION_WIDTH=1920
46
+ # Height in pixels
47
+ RESOLUTION_HEIGHT=1080
48
+
49
+ # VNC settings http://localhost:6080/vnc.html
50
+ VNC_PASSWORD=vncpassword
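A minimal sketch of how an application can consume this file, assuming `python-dotenv` is installed (it is not pinned in requirements.txt, so install it separately if needed); the variable names come from the file above:

```python
# Load .env and sanity-check a few of the variables defined above.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

openai_key = os.getenv("OPENAI_API_KEY", "")
chrome_path = os.getenv("CHROME_PATH", "")
persistent = os.getenv("CHROME_PERSISTENT_SESSION", "false").lower() == "true"

if not openai_key:
    print("OPENAI_API_KEY is empty; OpenAI-backed agents will not start.")
print(f"Chrome binary: {chrome_path or '(Playwright default)'}; persistent session: {persistent}")
```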
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/examples/test.png filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
1
+ name: Run Python script
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout
14
+ uses: actions/checkout@v2
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v2
18
+ with:
19
+ python-version: '3.9'
20
+
21
+ - name: Install Gradio
22
+ run: python -m pip install gradio
23
+
24
+ - name: Log in to Hugging Face
25
+ run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
26
+
27
+ - name: Deploy to Spaces
28
+ run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,189 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+ test_env/
133
+ myenv
134
+
135
+
136
+ # Spyder project settings
137
+ .spyderproject
138
+ .spyproject
139
+
140
+ # Rope project settings
141
+ .ropeproject
142
+
143
+ # mkdocs documentation
144
+ /site
145
+
146
+ # mypy
147
+ .mypy_cache/
148
+ .dmypy.json
149
+ dmypy.json
150
+
151
+ # Pyre type checker
152
+ .pyre/
153
+
154
+ # pytype static type analyzer
155
+ .pytype/
156
+
157
+ # Cython debug symbols
158
+ cython_debug/
159
+
160
+ # PyCharm
161
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
164
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
165
+ .idea/
166
+ temp
167
+ tmp
168
+
169
+
170
+ .DS_Store
171
+
172
+ private_example.py
173
+ private_example
174
+
175
+ browser_cookies.json
176
+ cookies.json
177
+ AgentHistory.json
178
+ cv_04_24.pdf
179
+ AgentHistoryList.json
180
+ *.gif
181
+
182
+ # For Sharing (.pem files)
183
+ .gradio/
184
+
185
+ # For Docker
186
+ data/
187
+
188
+ # For Config Files (Current Settings)
189
+ .config.pkl
.vscode/settings.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "python.analysis.typeCheckingMode": "basic",
3
+ "[python]": {
4
+ "editor.defaultFormatter": "charliermarsh.ruff",
5
+ "editor.formatOnSave": true,
6
+ "editor.codeActionsOnSave": {
7
+ "source.fixAll.ruff": "explicit",
8
+ "source.organizeImports.ruff": "explicit"
9
+ }
10
+ }
11
+ }
Dockerfile ADDED
@@ -0,0 +1,86 @@
1
+ FROM python:3.11-slim
2
+
3
+ # Install system dependencies
4
+ RUN apt-get update && apt-get install -y \
5
+ wget \
6
+ netcat-traditional \
7
+ gnupg \
8
+ curl \
9
+ unzip \
10
+ xvfb \
11
+ libgconf-2-4 \
12
+ libxss1 \
13
+ libnss3 \
14
+ libnspr4 \
15
+ libasound2 \
16
+ libatk1.0-0 \
17
+ libatk-bridge2.0-0 \
18
+ libcups2 \
19
+ libdbus-1-3 \
20
+ libdrm2 \
21
+ libgbm1 \
22
+ libgtk-3-0 \
23
+ libxcomposite1 \
24
+ libxdamage1 \
25
+ libxfixes3 \
26
+ libxrandr2 \
27
+ xdg-utils \
28
+ fonts-liberation \
29
+ dbus \
30
+ xauth \
32
+ x11vnc \
33
+ tigervnc-tools \
34
+ supervisor \
35
+ net-tools \
36
+ procps \
37
+ git \
38
+ python3-numpy \
39
+ fontconfig \
40
+ fonts-dejavu \
41
+ fonts-dejavu-core \
42
+ fonts-dejavu-extra \
43
+ && rm -rf /var/lib/apt/lists/*
44
+
45
+ # Install noVNC
46
+ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
47
+ && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
48
+ && ln -s /opt/novnc/vnc.html /opt/novnc/index.html
49
+
50
+ # Set platform for ARM64 compatibility
51
+ ARG TARGETPLATFORM=linux/amd64
52
+
53
+ # Set up working directory
54
+ WORKDIR /app
55
+
56
+ # Copy requirements and install Python dependencies
57
+ COPY requirements.txt .
58
+ RUN pip install --no-cache-dir -r requirements.txt
59
+
60
+ # Install Playwright and browsers with system dependencies
61
+ ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
62
+ RUN playwright install --with-deps chromium
63
+ RUN playwright install-deps
64
+
65
+ # Copy the application code
66
+ COPY . .
67
+
68
+ # Set environment variables
69
+ ENV PYTHONUNBUFFERED=1
70
+ ENV BROWSER_USE_LOGGING_LEVEL=info
71
+ ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
72
+ ENV ANONYMIZED_TELEMETRY=false
73
+ ENV DISPLAY=:99
74
+ ENV RESOLUTION=1920x1080x24
75
+ ENV VNC_PASSWORD=vncpassword
76
+ ENV CHROME_PERSISTENT_SESSION=true
77
+ ENV RESOLUTION_WIDTH=1920
78
+ ENV RESOLUTION_HEIGHT=1080
79
+
80
+ # Set up supervisor configuration
81
+ RUN mkdir -p /var/log/supervisor
82
+ COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
83
+
84
+ EXPOSE 7788 6080 5901
85
+
86
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
Dockerfile.arm64 ADDED
@@ -0,0 +1,85 @@
1
+ FROM python:3.11-slim
2
+
3
+ # Install system dependencies
4
+ RUN apt-get update && apt-get install -y \
5
+ wget \
6
+ gnupg \
7
+ curl \
8
+ unzip \
9
+ xvfb \
10
+ libgconf-2-4 \
11
+ libxss1 \
12
+ libnss3 \
13
+ libnspr4 \
14
+ libasound2 \
15
+ libatk1.0-0 \
16
+ libatk-bridge2.0-0 \
17
+ libcups2 \
18
+ libdbus-1-3 \
19
+ libdrm2 \
20
+ libgbm1 \
21
+ libgtk-3-0 \
22
+ libxcomposite1 \
23
+ libxdamage1 \
24
+ libxfixes3 \
25
+ libxrandr2 \
26
+ xdg-utils \
27
+ fonts-liberation \
28
+ dbus \
29
+ xauth \
31
+ x11vnc \
32
+ tigervnc-tools \
33
+ supervisor \
34
+ net-tools \
35
+ procps \
36
+ git \
37
+ python3-numpy \
38
+ fontconfig \
39
+ fonts-dejavu \
40
+ fonts-dejavu-core \
41
+ fonts-dejavu-extra \
42
+ && rm -rf /var/lib/apt/lists/*
43
+
44
+ # Install noVNC
45
+ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
46
+ && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
47
+ && ln -s /opt/novnc/vnc.html /opt/novnc/index.html
48
+
49
+ # Set platform explicitly for ARM64
50
+ ARG TARGETPLATFORM=linux/arm64
51
+
52
+ # Set up working directory
53
+ WORKDIR /app
54
+
55
+ # Copy requirements and install Python dependencies
56
+ COPY requirements.txt .
57
+ RUN pip install --no-cache-dir -r requirements.txt
58
+
59
+ # Install Playwright and browsers with system dependencies optimized for ARM64
60
+ ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
61
+ RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip install playwright && \
62
+ playwright install --with-deps chromium
63
+
64
+ # Copy the application code
65
+ COPY . .
66
+
67
+ # Set environment variables
68
+ ENV PYTHONUNBUFFERED=1
69
+ ENV BROWSER_USE_LOGGING_LEVEL=info
70
+ ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
71
+ ENV ANONYMIZED_TELEMETRY=false
72
+ ENV DISPLAY=:99
73
+ ENV RESOLUTION=1920x1080x24
74
+ ENV VNC_PASSWORD=vncpassword
75
+ ENV CHROME_PERSISTENT_SESSION=true
76
+ ENV RESOLUTION_WIDTH=1920
77
+ ENV RESOLUTION_HEIGHT=1080
78
+
79
+ # Set up supervisor configuration
80
+ RUN mkdir -p /var/log/supervisor
81
+ COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
82
+
83
+ EXPOSE 7788 6080 5901
84
+
85
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Browser Use Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,233 @@
1
  ---
2
- title: Web Ui
3
- emoji: 🦀
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 5.29.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: web-ui
3
+ app_file: webui.py
4
  sdk: gradio
5
+ sdk_version: 5.23.1
6
  ---
7
+ <img src="./assets/web-ui.png" alt="Browser Use Web UI" width="full"/>
8
 
9
+ <br/>
10
+
11
+ [![GitHub stars](https://img.shields.io/github/stars/browser-use/web-ui?style=social)](https://github.com/browser-use/web-ui/stargazers)
12
+ [![Discord](https://img.shields.io/discord/1303749220842340412?color=7289DA&label=Discord&logo=discord&logoColor=white)](https://link.browser-use.com/discord)
13
+ [![Documentation](https://img.shields.io/badge/Documentation-📕-blue)](https://docs.browser-use.com)
14
+ [![WarmShao](https://img.shields.io/twitter/follow/warmshao?style=social)](https://x.com/warmshao)
15
+
16
+ This project builds upon the foundation of [browser-use](https://github.com/browser-use/browser-use), which is designed to make websites accessible to AI agents.
17
+
18
+ We would like to officially thank [WarmShao](https://github.com/warmshao) for his contribution to this project.
19
+
20
+ **WebUI:** Built on Gradio, the WebUI supports most `browser-use` functionality. It is designed to be user-friendly and to make interacting with the browser agent easy.
21
+
22
+ **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including Google, OpenAI, Azure OpenAI, Anthropic, DeepSeek, and Ollama, and we plan to add support for even more models in the future.
23
+
24
+ **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording.
25
+
26
+ **Persistent Browser Sessions:** You can choose to keep the browser window open between AI tasks, allowing you to see the complete history and state of AI interactions.
27
+
28
+ <video src="https://github.com/user-attachments/assets/56bc7080-f2e3-4367-af22-6bf2245ff6cb" controls="controls">Your browser does not support playing this video!</video>
29
+
30
+ ## Installation Guide
31
+
32
+ ### Prerequisites
33
+ - Python 3.11 or higher
34
+ - Git (for cloning the repository)
35
+
36
+ ### Option 1: Local Installation
37
+
38
+ Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started.
39
+
40
+ #### Step 1: Clone the Repository
41
+ ```bash
42
+ git clone https://github.com/browser-use/web-ui.git
43
+ cd web-ui
44
+ ```
45
+
46
+ #### Step 2: Set Up Python Environment
47
+ We recommend using [uv](https://docs.astral.sh/uv/) for managing the Python environment.
48
+
49
+ Using uv (recommended):
50
+ ```bash
51
+ uv venv --python 3.11
52
+ ```
53
+
54
+ Activate the virtual environment:
55
+ - Windows (Command Prompt):
56
+ ```cmd
57
+ .venv\Scripts\activate
58
+ ```
59
+ - Windows (PowerShell):
60
+ ```powershell
61
+ .\.venv\Scripts\Activate.ps1
62
+ ```
63
+ - macOS/Linux:
64
+ ```bash
65
+ source .venv/bin/activate
66
+ ```
67
+
68
+ #### Step 3: Install Dependencies
69
+ Install Python packages:
70
+ ```bash
71
+ uv pip install -r requirements.txt
72
+ ```
73
+
74
+ Install Playwright:
75
+ ```bash
76
+ playwright install
77
+ ```
78
+
79
+ #### Step 4: Configure Environment
80
+ 1. Create a copy of the example environment file:
81
+ - Windows (Command Prompt):
82
+ ```bash
83
+ copy .env.example .env
84
+ ```
85
+ - macOS/Linux/Windows (PowerShell):
86
+ ```bash
87
+ cp .env.example .env
88
+ ```
89
+ 2. Open `.env` in your preferred text editor and add your API keys and other settings
90
+
91
+ ### Option 2: Docker Installation
92
+
93
+ #### Prerequisites
94
+ - Docker and Docker Compose installed
95
+ - [Docker Desktop](https://www.docker.com/products/docker-desktop/) (For Windows/macOS)
96
+ - [Docker Engine](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) (For Linux)
97
+
98
+ #### Installation Steps
99
+ 1. Clone the repository:
100
+ ```bash
101
+ git clone https://github.com/browser-use/web-ui.git
102
+ cd web-ui
103
+ ```
104
+
105
+ 2. Create and configure environment file:
106
+ - Windows (Command Prompt):
107
+ ```bash
108
+ copy .env.example .env
109
+ ```
110
+ - macOS/Linux/Windows (PowerShell):
111
+ ```bash
112
+ cp .env.example .env
113
+ ```
114
+ Edit `.env` with your preferred text editor and add your API keys
115
+
116
+ 3. Run with Docker:
117
+ ```bash
118
+ # Build and start the container with default settings (browser closes after AI tasks)
119
+ docker compose up --build
120
+ ```
121
+ ```bash
122
+ # Or run with persistent browser (browser stays open between AI tasks)
123
+ CHROME_PERSISTENT_SESSION=true docker compose up --build
124
+ ```
125
+
126
+
127
+ 4. Access the Application:
128
+ - Web Interface: Open `http://localhost:7788` in your browser
129
+ - VNC Viewer (for watching browser interactions): Open `http://localhost:6080/vnc.html`
130
+ - Default VNC password: "vncpassword"
131
+ - Can be changed by setting `VNC_PASSWORD` in your `.env` file
132
+
133
+ ## Usage
134
+
135
+ ### Local Setup
136
+ 1. **Run the WebUI:**
137
+ After completing the installation steps above, start the application:
138
+ ```bash
139
+ python webui.py --ip 127.0.0.1 --port 7788
140
+ ```
141
+ 2. WebUI options (an illustrative argparse sketch follows this list):
142
+ - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`.
143
+ - `--port`: The port to bind the WebUI to. Default is `7788`.
144
+ - `--theme`: The theme for the user interface. Default is `Ocean`.
145
+ - **Default**: The standard theme with a balanced design.
146
+ - **Soft**: A gentle, muted color scheme for a relaxed viewing experience.
147
+ - **Monochrome**: A grayscale theme with minimal color for simplicity and focus.
148
+ - **Glass**: A sleek, semi-transparent design for a modern appearance.
149
+ - **Origin**: A classic, retro-inspired theme for a nostalgic feel.
150
+ - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors.
151
+ - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect.
152
+ - `--dark-mode`: Enables dark mode for the user interface.
153
+ 3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
154
+ 4. **Using Your Own Browser (Optional):**
155
+ - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data.
156
+ - Windows
157
+ ```env
158
+ CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
159
+ CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
160
+ ```
161
+ > Note: Replace `YourUsername` with your actual Windows username.
162
+ - Mac
163
+ ```env
164
+ CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
165
+ CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
166
+ ```
167
+ - Close all Chrome windows
168
+ - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
169
+ - Check the "Use Own Browser" option within the Browser Settings.
170
+ 5. **Keep Browser Open (Optional):**
171
+ - Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file.
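The flags above map onto a small CLI surface. Since `webui.py` itself is not part of this commit, the following is only an illustrative sketch of how those flags and defaults could be wired with argparse, not the repository's actual code:

```python
# Illustrative sketch of the documented CLI; flags and defaults are taken
# from the list above, everything else is an assumption.
import argparse

parser = argparse.ArgumentParser(description="Browser Use WebUI")
parser.add_argument("--ip", default="127.0.0.1", help="IP address to bind the WebUI to")
parser.add_argument("--port", type=int, default=7788, help="Port to bind the WebUI to")
parser.add_argument(
    "--theme",
    default="Ocean",
    choices=["Default", "Soft", "Monochrome", "Glass", "Origin", "Citrus", "Ocean"],
    help="Theme for the user interface",
)
parser.add_argument("--dark-mode", action="store_true", help="Enable dark mode")
args = parser.parse_args()
# A Gradio app would then launch with something like:
# demo.launch(server_name=args.ip, server_port=args.port)
```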
172
+
173
+ ### Docker Setup
174
+ 1. **Environment Variables:**
175
+ - All configuration is done through the `.env` file
176
+ - Available environment variables:
177
+ ```
178
+ # LLM API Keys
179
+ OPENAI_API_KEY=your_key_here
180
+ ANTHROPIC_API_KEY=your_key_here
181
+ GOOGLE_API_KEY=your_key_here
182
+
183
+ # Browser Settings
184
+ CHROME_PERSISTENT_SESSION=true # Set to true to keep browser open between AI tasks
185
+ RESOLUTION=1920x1080x24 # Custom resolution format: WIDTHxHEIGHTxDEPTH
186
+ RESOLUTION_WIDTH=1920 # Custom width in pixels
187
+ RESOLUTION_HEIGHT=1080 # Custom height in pixels
188
+
189
+ # VNC Settings
190
+ VNC_PASSWORD=your_vnc_password # Optional, defaults to "vncpassword"
191
+ ```
192
+
193
+ 2. **Platform Support:**
194
+ - Supports both AMD64 and ARM64 architectures
195
+ - For ARM64 systems (e.g., Apple Silicon Macs), the container will automatically use the appropriate image
196
+
197
+ 3. **Browser Persistence Modes:**
198
+ - **Default Mode (CHROME_PERSISTENT_SESSION=false):**
199
+ - Browser opens and closes with each AI task
200
+ - Clean state for each interaction
201
+ - Lower resource usage
202
+
203
+ - **Persistent Mode (CHROME_PERSISTENT_SESSION=true):**
204
+ - Browser stays open between AI tasks
205
+ - Maintains history and state
206
+ - Allows viewing previous AI interactions
207
+ - Set in `.env` file or via environment variable when starting container
208
+
209
+ 4. **Viewing Browser Interactions:**
210
+ - Access the noVNC viewer at `http://localhost:6080/vnc.html`
211
+ - Enter the VNC password (default: "vncpassword", or whatever you set in `VNC_PASSWORD`)
212
+ - Direct VNC access available on port 5900 (mapped to container port 5901)
213
+ - You can now see all browser interactions in real-time
214
+
215
+ 5. **Container Management:**
216
+ ```bash
217
+ # Start with persistent browser
218
+ CHROME_PERSISTENT_SESSION=true docker compose up -d
219
+
220
+ # Start with default mode (browser closes after tasks)
221
+ docker compose up -d
222
+
223
+ # View logs
224
+ docker compose logs -f
225
+
226
+ # Stop the container
227
+ docker compose down
228
+ ```
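Because the compose file also maps Chrome's remote-debugging port (9222), you can probe the DevTools endpoint from the host to confirm the browser inside the container is up. This is a hedged convenience check, not part of the repo; it assumes the container is running with the default port mapping:

```python
# Probe Chrome's DevTools endpoint exposed by docker-compose (port 9222).
# /json/version is a standard Chrome DevTools Protocol endpoint.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:9222/json/version", timeout=5) as resp:
    info = json.load(resp)

print("Browser:", info.get("Browser"))
print("DevTools WebSocket:", info.get("webSocketDebuggerUrl"))
```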
229
+
230
+ ## Changelog
231
+ - [x] **2025/01/26:** Thanks to @vvincent1234. browser-use-webui can now work with DeepSeek-R1 to engage in deep thinking!
232
+ - [x] **2025/01/10:** Thanks to @casistack. We now have a Docker setup option and support for keeping the browser open between tasks. [Video tutorial demo](https://github.com/browser-use/web-ui/issues/1#issuecomment-2582511750).
233
+ - [x] **2025/01/06:** Thanks to @richard-devbot. A new, well-designed WebUI is released. [Video tutorial demo](https://github.com/warmshao/browser-use-webui/issues/1#issuecomment-2573393113).
SECURITY.md ADDED
@@ -0,0 +1,19 @@
1
+ ## Reporting Security Issues
2
+
3
+ If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure.
4
+
5
+ **Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.**
6
+
7
+ Instead, please open a new [GitHub security advisory](https://github.com/browser-use/web-ui/security/advisories/new).
8
+
9
+ Please include as much of the information listed below as you can to help me better understand and resolve the issue:
10
+
11
+ * The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting)
12
+ * Full paths of source file(s) related to the manifestation of the issue
13
+ * The location of the affected source code (tag/branch/commit or direct URL)
14
+ * Any special configuration required to reproduce the issue
15
+ * Step-by-step instructions to reproduce the issue
16
+ * Proof-of-concept or exploit code (if possible)
17
+ * Impact of the issue, including how an attacker might exploit the issue
18
+
19
+ This information will help me triage your report more quickly.
assets/examples/test.png ADDED

Git LFS Details

  • SHA256: 23e4fe8c9836cd35393315a3cca074dbd55a8645289ea337e3300269dda06900
  • Pointer size: 131 Bytes
  • Size of remote file: 423 kB
assets/web-ui.png ADDED
docker-compose.yml ADDED
@@ -0,0 +1,50 @@
1
+ services:
2
+ browser-use-webui:
3
+ platform: linux/amd64
4
+ build:
5
+ context: .
6
+ dockerfile: ${DOCKERFILE:-Dockerfile}
7
+ args:
8
+ TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
9
+ ports:
10
+ - "7788:7788" # Gradio default port
11
+ - "6080:6080" # noVNC web interface
12
+ - "5901:5901" # VNC port
13
+ - "9222:9222" # Chrome remote debugging port
14
+ environment:
15
+ - OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1}
16
+ - OPENAI_API_KEY=${OPENAI_API_KEY:-}
17
+ - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
18
+ - GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
19
+ - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-}
20
+ - AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-}
21
+ - DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com}
22
+ - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
23
+ - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
24
+ - ANONYMIZED_TELEMETRY=false
25
+ - CHROME_PATH=/usr/bin/google-chrome
26
+ - CHROME_USER_DATA=/app/data/chrome_data
27
+ - CHROME_PERSISTENT_SESSION=${CHROME_PERSISTENT_SESSION:-false}
28
+ - DISPLAY=:99
29
+ - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
30
+ - RESOLUTION=${RESOLUTION:-1920x1080x24}
31
+ - RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920}
32
+ - RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080}
33
+ - VNC_PASSWORD=${VNC_PASSWORD:-vncpassword}
34
+ - CHROME_DEBUGGING_PORT=9222
35
+ - CHROME_DEBUGGING_HOST=localhost
36
+ volumes:
37
+ - /tmp/.X11-unix:/tmp/.X11-unix
38
+ restart: unless-stopped
39
+ shm_size: '2gb'
40
+ cap_add:
41
+ - SYS_ADMIN
42
+ security_opt:
43
+ - seccomp=unconfined
44
+ tmpfs:
45
+ - /tmp
46
+ healthcheck:
47
+ test: ["CMD", "nc", "-z", "localhost", "5901"]
48
+ interval: 10s
49
+ timeout: 5s
50
+ retries: 3
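The healthcheck above shells out to `nc -z localhost 5901` inside the container. A host-side equivalent, sketched under the assumption that the default 5901 port mapping is in place:

```python
# Host-side equivalent of the compose healthcheck: is the VNC port
# accepting TCP connections?
import socket

def port_open(host: str, port: int, timeout: float = 5.0) -> bool:
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

print("VNC reachable:", port_open("localhost", 5901))
```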
entrypoint.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+
3
+ # Start supervisord in the foreground to properly manage child processes
4
+ exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ browser-use==0.1.37
2
+ pyperclip==1.9.0
3
+ gradio==5.10.0
4
+ json-repair
5
+ langchain-mistralai==0.2.4
src/__init__.py ADDED
File without changes
src/agent/__init__.py ADDED
File without changes
src/agent/custom_agent.py ADDED
@@ -0,0 +1,571 @@
1
+ import json
2
+ import logging
3
+ import pdb
4
+ import traceback
5
+ from typing import Optional, Type, List, Dict, Any, Callable
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ import os
8
+ import base64
9
+ import io
10
+ import platform
11
+ from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
12
+ from browser_use.agent.service import Agent
13
+ from browser_use.agent.views import (
14
+ ActionResult,
15
+ ActionModel,
16
+ AgentHistoryList,
17
+ AgentOutput,
18
+ AgentHistory,
19
+ )
20
+ from browser_use.browser.browser import Browser
21
+ from browser_use.browser.context import BrowserContext
22
+ from browser_use.browser.views import BrowserStateHistory
23
+ from browser_use.controller.service import Controller
24
+ from browser_use.telemetry.views import (
25
+ AgentEndTelemetryEvent,
26
+ AgentRunTelemetryEvent,
27
+ AgentStepTelemetryEvent,
28
+ )
29
+ from browser_use.utils import time_execution_async
30
+ from langchain_core.language_models.chat_models import BaseChatModel
31
+ from langchain_core.messages import (
32
+ BaseMessage,
33
+ HumanMessage,
34
+ AIMessage
35
+ )
36
+ from browser_use.agent.prompts import PlannerPrompt
37
+
38
+ from json_repair import repair_json
39
+ from src.utils.agent_state import AgentState
40
+
41
+ from .custom_message_manager import CustomMessageManager
42
+ from .custom_views import CustomAgentOutput, CustomAgentStepInfo
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
+ class CustomAgent(Agent):
48
+ def __init__(
49
+ self,
50
+ task: str,
51
+ llm: BaseChatModel,
52
+ add_infos: str = "",
53
+ browser: Browser | None = None,
54
+ browser_context: BrowserContext | None = None,
55
+ controller: Controller = Controller(),
56
+ use_vision: bool = True,
57
+ use_vision_for_planner: bool = False,
58
+ save_conversation_path: Optional[str] = None,
59
+ save_conversation_path_encoding: Optional[str] = 'utf-8',
60
+ max_failures: int = 3,
61
+ retry_delay: int = 10,
62
+ system_prompt_class: Type[SystemPrompt] = SystemPrompt,
63
+ agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt,
64
+ max_input_tokens: int = 128000,
65
+ validate_output: bool = False,
66
+ message_context: Optional[str] = None,
67
+ generate_gif: bool | str = True,
68
+ sensitive_data: Optional[Dict[str, str]] = None,
69
+ available_file_paths: Optional[list[str]] = None,
70
+ include_attributes: list[str] = [
71
+ 'title',
72
+ 'type',
73
+ 'name',
74
+ 'role',
75
+ 'tabindex',
76
+ 'aria-label',
77
+ 'placeholder',
78
+ 'value',
79
+ 'alt',
80
+ 'aria-expanded',
81
+ ],
82
+ max_error_length: int = 400,
83
+ max_actions_per_step: int = 10,
84
+ tool_call_in_content: bool = True,
85
+ initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
86
+ # Cloud Callbacks
87
+ register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], None] | None = None,
88
+ register_done_callback: Callable[['AgentHistoryList'], None] | None = None,
89
+ tool_calling_method: Optional[str] = 'auto',
90
+ page_extraction_llm: Optional[BaseChatModel] = None,
91
+ planner_llm: Optional[BaseChatModel] = None,
92
+ planner_interval: int = 1, # Run planner every N steps
93
+ ):
94
+ super().__init__(
95
+ task=task,
96
+ llm=llm,
97
+ browser=browser,
98
+ browser_context=browser_context,
99
+ controller=controller,
100
+ use_vision=use_vision,
101
+ use_vision_for_planner=use_vision_for_planner,
102
+ save_conversation_path=save_conversation_path,
103
+ save_conversation_path_encoding=save_conversation_path_encoding,
104
+ max_failures=max_failures,
105
+ retry_delay=retry_delay,
106
+ system_prompt_class=system_prompt_class,
107
+ max_input_tokens=max_input_tokens,
108
+ validate_output=validate_output,
109
+ message_context=message_context,
110
+ generate_gif=generate_gif,
111
+ sensitive_data=sensitive_data,
112
+ available_file_paths=available_file_paths,
113
+ include_attributes=include_attributes,
114
+ max_error_length=max_error_length,
115
+ max_actions_per_step=max_actions_per_step,
116
+ tool_call_in_content=tool_call_in_content,
117
+ initial_actions=initial_actions,
118
+ register_new_step_callback=register_new_step_callback,
119
+ register_done_callback=register_done_callback,
120
+ tool_calling_method=tool_calling_method,
121
+ planner_llm=planner_llm,
122
+ planner_interval=planner_interval
123
+ )
124
+ if self.model_name in ["deepseek-reasoner"] or "deepseek-r1" in self.model_name:
125
+ # deepseek-reasoner does not support function calling
126
+ self.use_deepseek_r1 = True
127
+ # deepseek-reasoner only supports a 64,000-token context
128
+ self.max_input_tokens = 64000
129
+ else:
130
+ self.use_deepseek_r1 = False
131
+
132
+ # record last actions
133
+ self._last_actions = None
134
+ # record extract content
135
+ self.extracted_content = ""
136
+ # custom new info
137
+ self.add_infos = add_infos
138
+
139
+ self.agent_prompt_class = agent_prompt_class
140
+ self.message_manager = CustomMessageManager(
141
+ llm=self.llm,
142
+ task=self.task,
143
+ action_descriptions=self.controller.registry.get_prompt_description(),
144
+ system_prompt_class=self.system_prompt_class,
145
+ agent_prompt_class=agent_prompt_class,
146
+ max_input_tokens=self.max_input_tokens,
147
+ include_attributes=self.include_attributes,
148
+ max_error_length=self.max_error_length,
149
+ max_actions_per_step=self.max_actions_per_step,
150
+ message_context=self.message_context,
151
+ sensitive_data=self.sensitive_data
152
+ )
153
+
154
+ def _setup_action_models(self) -> None:
155
+ """Setup dynamic action models from controller's registry"""
156
+ # Get the dynamic action model from controller's registry
157
+ self.ActionModel = self.controller.registry.create_action_model()
158
+ # Create output model with the dynamic actions
159
+ self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel)
160
+
161
+ def _log_response(self, response: CustomAgentOutput) -> None:
162
+ """Log the model's response"""
163
+ if "Success" in response.current_state.prev_action_evaluation:
164
+ emoji = "✅"
165
+ elif "Failed" in response.current_state.prev_action_evaluation:
166
+ emoji = "❌"
167
+ else:
168
+ emoji = "🤷"
169
+
170
+ logger.info(f"{emoji} Eval: {response.current_state.prev_action_evaluation}")
171
+ logger.info(f"🧠 New Memory: {response.current_state.important_contents}")
172
+ logger.info(f"⏳ Task Progress: \n{response.current_state.task_progress}")
173
+ logger.info(f"📋 Future Plans: \n{response.current_state.future_plans}")
174
+ logger.info(f"🤔 Thought: {response.current_state.thought}")
175
+ logger.info(f"🎯 Summary: {response.current_state.summary}")
176
+ for i, action in enumerate(response.action):
177
+ logger.info(
178
+ f"🛠️ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}"
179
+ )
180
+
181
+ def update_step_info(
182
+ self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
183
+ ):
184
+ """
185
+ update step info
186
+ """
187
+ if step_info is None:
188
+ return
189
+
190
+ step_info.step_number += 1
191
+ important_contents = model_output.current_state.important_contents
192
+ if (
193
+ important_contents
194
+ and "None" not in important_contents
195
+ and important_contents not in step_info.memory
196
+ ):
197
+ step_info.memory += important_contents + "\n"
198
+
199
+ task_progress = model_output.current_state.task_progress
200
+ if task_progress and "None" not in task_progress:
201
+ step_info.task_progress = task_progress
202
+
203
+ future_plans = model_output.current_state.future_plans
204
+ if future_plans and "None" not in future_plans:
205
+ step_info.future_plans = future_plans
206
+
207
+ logger.info(f"🧠 All Memory: \n{step_info.memory}")
208
+
209
+ @time_execution_async("--get_next_action")
210
+ async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
211
+ """Get next action from LLM based on current state"""
212
+
213
+ ai_message = self.llm.invoke(input_messages)
214
+ self.message_manager._add_message_with_tokens(ai_message)
215
+
216
+ if hasattr(ai_message, "reasoning_content"):
217
+ logger.info("🤯 Start Deep Thinking: ")
218
+ logger.info(ai_message.reasoning_content)
219
+ logger.info("🤯 End Deep Thinking")
220
+
221
+ if isinstance(ai_message.content, list):
222
+ ai_content = ai_message.content[0]
223
+ else:
224
+ ai_content = ai_message.content
225
+
226
+ ai_content = ai_content.replace("```json", "").replace("```", "")
227
+ ai_content = repair_json(ai_content)
228
+ parsed_json = json.loads(ai_content)
229
+ parsed: AgentOutput = self.AgentOutput(**parsed_json)
230
+
231
+ if parsed is None:
232
+ logger.debug(ai_message.content)
233
+ raise ValueError('Could not parse response.')
234
+
235
+ # Limit actions to maximum allowed per step
236
+ parsed.action = parsed.action[: self.max_actions_per_step]
237
+ self._log_response(parsed)
238
+ self.n_steps += 1
239
+
240
+ return parsed
241
+
242
+ async def _run_planner(self) -> Optional[str]:
243
+ """Run the planner to analyze state and suggest next steps"""
244
+ # Skip planning if no planner_llm is set
245
+ if not self.planner_llm:
246
+ return None
247
+
248
+ # Create planner message history using full message history
249
+ planner_messages = [
250
+ PlannerPrompt(self.action_descriptions).get_system_message(),
251
+ *self.message_manager.get_messages()[1:], # Use full message history except the first
252
+ ]
253
+
254
+ if not self.use_vision_for_planner and self.use_vision:
255
+ last_state_message = planner_messages[-1]
256
+ # remove image from last state message
257
+ new_msg = ''
258
+ if isinstance(last_state_message.content, list):
259
+ for msg in last_state_message.content:
260
+ if msg['type'] == 'text':
261
+ new_msg += msg['text']
262
+ elif msg['type'] == 'image_url':
263
+ continue
264
+ else:
265
+ new_msg = last_state_message.content
266
+
267
+ planner_messages[-1] = HumanMessage(content=new_msg)
268
+
269
+ # Get planner output
270
+ response = await self.planner_llm.ainvoke(planner_messages)
271
+ plan = response.content
272
+ last_state_message = planner_messages[-1]
273
+ # remove image from last state message
274
+ if isinstance(last_state_message.content, list):
275
+ for msg in last_state_message.content:
276
+ if msg['type'] == 'text':
277
+ msg['text'] += f"\nPlanning Agent outputs plans:\n {plan}\n"
278
+ else:
279
+ last_state_message.content += f"\nPlanning Agent outputs plans:\n {plan}\n "
280
+
281
+ try:
282
+ plan_json = json.loads(plan.replace("```json", "").replace("```", ""))
283
+ logger.info(f'📋 Plans:\n{json.dumps(plan_json, indent=4)}')
284
+
285
+ if hasattr(response, "reasoning_content"):
286
+ logger.info("🤯 Start Planning Deep Thinking: ")
287
+ logger.info(response.reasoning_content)
288
+ logger.info("🤯 End Planning Deep Thinking")
289
+
290
+ except json.JSONDecodeError:
291
+ logger.info(f'📋 Plans:\n{plan}')
292
+ except Exception as e:
293
+ logger.debug(f'Error parsing planning analysis: {e}')
294
+ logger.info(f'📋 Plans: {plan}')
295
+
296
+ @time_execution_async("--step")
297
+ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
298
+ """Execute one step of the task"""
299
+ logger.info(f"\n📍 Step {self.n_steps}")
300
+ state = None
301
+ model_output = None
302
+ result: list[ActionResult] = []
303
+ actions: list[ActionModel] = []
304
+
305
+ try:
306
+ state = await self.browser_context.get_state()
307
+ self._check_if_stopped_or_paused()
308
+
309
+ self.message_manager.add_state_message(state, self._last_actions, self._last_result, step_info,
310
+ self.use_vision)
311
+
312
+ # Run planner at specified intervals if planner is configured
313
+ if self.planner_llm and self.n_steps % self.planner_interval == 0:
314
+ await self._run_planner()
315
+ input_messages = self.message_manager.get_messages()
316
+ self._check_if_stopped_or_paused()
317
+ try:
318
+ model_output = await self.get_next_action(input_messages)
319
+ if self.register_new_step_callback:
320
+ self.register_new_step_callback(state, model_output, self.n_steps)
321
+ self.update_step_info(model_output, step_info)
322
+ self._save_conversation(input_messages, model_output)
323
+ if self.model_name != "deepseek-reasoner":
324
+ # remove prev message
325
+ self.message_manager._remove_state_message_by_index(-1)
326
+ self._check_if_stopped_or_paused()
327
+ except Exception as e:
328
+ # model call failed, remove last state message from history
329
+ self.message_manager._remove_state_message_by_index(-1)
330
+ raise e
331
+
332
+ actions: list[ActionModel] = model_output.action
333
+ result: list[ActionResult] = await self.controller.multi_act(
334
+ actions,
335
+ self.browser_context,
336
+ page_extraction_llm=self.page_extraction_llm,
337
+ sensitive_data=self.sensitive_data,
338
+ check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
339
+ available_file_paths=self.available_file_paths,
340
+ )
341
+ if len(result) != len(actions):
342
+ # The action sequence was cut short; surface what happened to the LLM
343
+ for ri in range(len(result), len(actions)):
344
+ result.append(ActionResult(extracted_content=None,
345
+ include_in_memory=True,
346
+ error=f"{actions[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
347
+ Something new appeared after action {actions[len(result) - 1].model_dump_json(exclude_unset=True)}",
348
+ is_done=False))
349
+ for ret_ in result:
350
+ if ret_.extracted_content and "Extracted page" in ret_.extracted_content:
351
+ # record every extracted page
352
+ self.extracted_content += ret_.extracted_content
353
+ self._last_result = result
354
+ self._last_actions = actions
355
+ if len(result) > 0 and result[-1].is_done:
356
+ if not self.extracted_content:
357
+ self.extracted_content = step_info.memory
358
+ result[-1].extracted_content = self.extracted_content
359
+ logger.info(f"📄 Result: {result[-1].extracted_content}")
360
+
361
+ self.consecutive_failures = 0
362
+
363
+ except Exception as e:
364
+ result = await self._handle_step_error(e)
365
+ self._last_result = result
366
+
367
+ finally:
368
+ actions = [a.model_dump(exclude_unset=True) for a in model_output.action] if model_output else []
369
+ self.telemetry.capture(
370
+ AgentStepTelemetryEvent(
371
+ agent_id=self.agent_id,
372
+ step=self.n_steps,
373
+ actions=actions,
374
+ consecutive_failures=self.consecutive_failures,
375
+ step_error=[r.error for r in result if r.error] if result else ['No result'],
376
+ )
377
+ )
378
+ if not result:
379
+ return
380
+
381
+ if state:
382
+ self._make_history_item(model_output, state, result)
383
+
384
+ async def run(self, max_steps: int = 100) -> AgentHistoryList:
385
+ """Execute the task with maximum number of steps"""
386
+ try:
387
+ self._log_agent_run()
388
+
389
+ # Execute initial actions if provided
390
+ if self.initial_actions:
391
+ result = await self.controller.multi_act(
392
+ self.initial_actions,
393
+ self.browser_context,
394
+ check_for_new_elements=False,
395
+ page_extraction_llm=self.page_extraction_llm,
396
+ check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
397
+ available_file_paths=self.available_file_paths,
398
+ )
399
+ self._last_result = result
400
+
401
+ step_info = CustomAgentStepInfo(
402
+ task=self.task,
403
+ add_infos=self.add_infos,
404
+ step_number=1,
405
+ max_steps=max_steps,
406
+ memory="",
407
+ task_progress="",
408
+ future_plans=""
409
+ )
410
+
411
+ for step in range(max_steps):
412
+ if self._too_many_failures():
413
+ break
414
+
415
+ # Execute one step
416
+ await self.step(step_info)
417
+
418
+ if self.history.is_done():
419
+ if (
420
+ self.validate_output and step < max_steps - 1
421
+ ): # if last step, we dont need to validate
422
+ if not await self._validate_output():
423
+ continue
424
+
425
+ logger.info("✅ Task completed successfully")
426
+ break
427
+ else:
428
+ logger.info("❌ Failed to complete task in maximum steps")
429
+ if not self.extracted_content:
430
+ self.history.history[-1].result[-1].extracted_content = step_info.memory
431
+ else:
432
+ self.history.history[-1].result[-1].extracted_content = self.extracted_content
433
+
434
+ return self.history
435
+
436
+ finally:
437
+ self.telemetry.capture(
438
+ AgentEndTelemetryEvent(
439
+ agent_id=self.agent_id,
440
+ success=self.history.is_done(),
441
+ steps=self.n_steps,
442
+ max_steps_reached=self.n_steps >= max_steps,
443
+ errors=self.history.errors(),
444
+ )
445
+ )
446
+
447
+ if not self.injected_browser_context:
448
+ await self.browser_context.close()
449
+
450
+ if not self.injected_browser and self.browser:
451
+ await self.browser.close()
452
+
453
+ if self.generate_gif:
454
+ output_path: str = 'agent_history.gif'
455
+ if isinstance(self.generate_gif, str):
456
+ output_path = self.generate_gif
457
+
458
+ self.create_history_gif(output_path=output_path)
459
+
460
+ def create_history_gif(
461
+ self,
462
+ output_path: str = 'agent_history.gif',
463
+ duration: int = 3000,
464
+ show_goals: bool = True,
465
+ show_task: bool = True,
466
+ show_logo: bool = False,
467
+ font_size: int = 40,
468
+ title_font_size: int = 56,
469
+ goal_font_size: int = 44,
470
+ margin: int = 40,
471
+ line_spacing: float = 1.5,
472
+ ) -> None:
473
+ """Create a GIF from the agent's history with overlaid task and goal text."""
474
+ if not self.history.history:
475
+ logger.warning('No history to create GIF from')
476
+ return
477
+
478
+ images = []
479
+ # if history is empty or first screenshot is None, we can't create a gif
480
+ if not self.history.history or not self.history.history[0].state.screenshot:
481
+ logger.warning('No history or first screenshot to create GIF from')
482
+ return
483
+
484
+ # Try to load nicer fonts
485
+ try:
486
+ # Try different font options in order of preference
487
+ font_options = ['Helvetica', 'Arial', 'DejaVuSans', 'Verdana']
488
+ font_loaded = False
489
+
490
+ for font_name in font_options:
491
+ try:
492
+ if platform.system() == 'Windows':
493
+ # Need to specify the abs font path on Windows
494
+ font_name = os.path.join(os.getenv('WIN_FONT_DIR', 'C:\\Windows\\Fonts'), font_name + '.ttf')
495
+ regular_font = ImageFont.truetype(font_name, font_size)
496
+ title_font = ImageFont.truetype(font_name, title_font_size)
497
+ goal_font = ImageFont.truetype(font_name, goal_font_size)
498
+ font_loaded = True
499
+ break
500
+ except OSError:
501
+ continue
502
+
503
+ if not font_loaded:
504
+ raise OSError('No preferred fonts found')
505
+
506
+ except OSError:
507
+ regular_font = ImageFont.load_default()
508
+ title_font = ImageFont.load_default()
509
+
510
+ goal_font = regular_font
511
+
512
+ # Load logo if requested
513
+ logo = None
514
+ if show_logo:
515
+ try:
516
+ logo = Image.open('./static/browser-use.png')
517
+ # Resize logo to be small (e.g., 40px height)
518
+ logo_height = 150
519
+ aspect_ratio = logo.width / logo.height
520
+ logo_width = int(logo_height * aspect_ratio)
521
+ logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS)
522
+ except Exception as e:
523
+ logger.warning(f'Could not load logo: {e}')
524
+
525
+ # Create task frame if requested
526
+ if show_task and self.task:
527
+ task_frame = self._create_task_frame(
528
+ self.task,
529
+ self.history.history[0].state.screenshot,
530
+ title_font,
531
+ regular_font,
532
+ logo,
533
+ line_spacing,
534
+ )
535
+ images.append(task_frame)
536
+
537
+ # Process each history item
538
+ for i, item in enumerate(self.history.history, 1):
539
+ if not item.state.screenshot:
540
+ continue
541
+
542
+ # Convert base64 screenshot to PIL Image
543
+ img_data = base64.b64decode(item.state.screenshot)
544
+ image = Image.open(io.BytesIO(img_data))
545
+
546
+ if show_goals and item.model_output:
547
+ image = self._add_overlay_to_image(
548
+ image=image,
549
+ step_number=i,
550
+ goal_text=item.model_output.current_state.thought,
551
+ regular_font=regular_font,
552
+ title_font=title_font,
553
+ margin=margin,
554
+ logo=logo,
555
+ )
556
+
557
+ images.append(image)
558
+
559
+ if images:
560
+ # Save the GIF
561
+ images[0].save(
562
+ output_path,
563
+ save_all=True,
564
+ append_images=images[1:],
565
+ duration=duration,
566
+ loop=0,
567
+ optimize=False,
568
+ )
569
+ logger.info(f'Created GIF at {output_path}')
570
+ else:
571
+ logger.warning('No images found in history to create GIF')
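To make the class above concrete, here is a hedged usage sketch. The class names come from this commit; the task string and model choice are illustrative, and `final_result()` is assumed from the surrounding browser-use API rather than shown in this diff:

```python
# Usage sketch for CustomAgent (requires OPENAI_API_KEY in the environment;
# the task string and model name are illustrative assumptions).
import asyncio

from langchain_openai import ChatOpenAI

from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt


async def main() -> None:
    agent = CustomAgent(
        task="Open https://example.com and summarize the page",
        llm=ChatOpenAI(model="gpt-4o"),
        add_infos="",  # extra hints surfaced to the agent in its state message
        system_prompt_class=CustomSystemPrompt,
        agent_prompt_class=CustomAgentMessagePrompt,
        use_vision=True,
    )
    history = await agent.run(max_steps=10)
    print(history.final_result())  # assumed AgentHistoryList accessor


asyncio.run(main())
```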
src/agent/custom_message_manager.py ADDED
@@ -0,0 +1,120 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import List, Optional, Type, Dict
5
+
6
+ from browser_use.agent.message_manager.service import MessageManager
7
+ from browser_use.agent.message_manager.views import MessageHistory
8
+ from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
9
+ from browser_use.agent.views import ActionResult, AgentStepInfo, ActionModel
10
+ from browser_use.browser.views import BrowserState
11
+ from langchain_core.language_models import BaseChatModel
12
+ from langchain_anthropic import ChatAnthropic
14
+ from langchain_core.messages import (
15
+ AIMessage,
16
+ BaseMessage,
17
+ HumanMessage,
18
+ ToolMessage
19
+ )
20
+ from langchain_openai import ChatOpenAI
21
+ from ..utils.llm import DeepSeekR1ChatOpenAI
22
+ from .custom_prompts import CustomAgentMessagePrompt
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class CustomMessageManager(MessageManager):
28
+ def __init__(
29
+ self,
30
+ llm: BaseChatModel,
31
+ task: str,
32
+ action_descriptions: str,
33
+ system_prompt_class: Type[SystemPrompt],
34
+ agent_prompt_class: Type[AgentMessagePrompt],
35
+ max_input_tokens: int = 128000,
36
+ estimated_characters_per_token: int = 3,
37
+ image_tokens: int = 800,
38
+ include_attributes: list[str] = [],
39
+ max_error_length: int = 400,
40
+ max_actions_per_step: int = 10,
41
+ message_context: Optional[str] = None,
42
+ sensitive_data: Optional[Dict[str, str]] = None,
43
+ ):
44
+ super().__init__(
45
+ llm=llm,
46
+ task=task,
47
+ action_descriptions=action_descriptions,
48
+ system_prompt_class=system_prompt_class,
49
+ max_input_tokens=max_input_tokens,
50
+ estimated_characters_per_token=estimated_characters_per_token,
51
+ image_tokens=image_tokens,
52
+ include_attributes=include_attributes,
53
+ max_error_length=max_error_length,
54
+ max_actions_per_step=max_actions_per_step,
55
+ message_context=message_context,
56
+ sensitive_data=sensitive_data
57
+ )
58
+ self.agent_prompt_class = agent_prompt_class
59
+ # Custom: Move Task info to state_message
60
+ self.history = MessageHistory()
61
+ self._add_message_with_tokens(self.system_prompt)
62
+
63
+ if self.message_context:
64
+ context_message = HumanMessage(content=self.message_context)
65
+ self._add_message_with_tokens(context_message)
66
+
67
+ def cut_messages(self):
68
+ """Get current message list, potentially trimmed to max tokens"""
69
+ diff = self.history.total_tokens - self.max_input_tokens
70
+ min_message_len = 2 if self.message_context is not None else 1
71
+
72
+ while diff > 0 and len(self.history.messages) > min_message_len:
73
+ self.history.remove_message(min_message_len) # always remove the oldest message
74
+ diff = self.history.total_tokens - self.max_input_tokens
75
+
76
+ def add_state_message(
77
+ self,
78
+ state: BrowserState,
79
+ actions: Optional[List[ActionModel]] = None,
80
+ result: Optional[List[ActionResult]] = None,
81
+ step_info: Optional[AgentStepInfo] = None,
82
+ use_vision=True,
83
+ ) -> None:
84
+ """Add browser state as human message"""
85
+ # Add the browser state (plus the last actions/results) as a human message
86
+ state_message = self.agent_prompt_class(
87
+ state,
88
+ actions,
89
+ result,
90
+ include_attributes=self.include_attributes,
91
+ max_error_length=self.max_error_length,
92
+ step_info=step_info,
93
+ ).get_user_message(use_vision)
94
+ self._add_message_with_tokens(state_message)
95
+
96
+ def _count_text_tokens(self, text: str) -> int:
97
+ if isinstance(self.llm, (ChatOpenAI, ChatAnthropic, DeepSeekR1ChatOpenAI)):
98
+ try:
99
+ tokens = self.llm.get_num_tokens(text)
100
+ except Exception:
101
+ tokens = (
102
+ len(text) // self.estimated_characters_per_token
103
+ ) # Rough estimate if no tokenizer available
104
+ else:
105
+ tokens = (
106
+ len(text) // self.estimated_characters_per_token
107
+ ) # Rough estimate if no tokenizer available
108
+ return tokens
109
+
110
+ def _remove_state_message_by_index(self, remove_ind=-1) -> None:
111
+ """Remove last state message from history"""
112
+ i = len(self.history.messages) - 1
113
+ remove_cnt = 0
114
+ while i >= 0:
115
+ if isinstance(self.history.messages[i].message, HumanMessage):
116
+ remove_cnt += 1
117
+ if remove_cnt == abs(remove_ind):
118
+ self.history.remove_message(i)
119
+ break
120
+ i -= 1
src/agent/custom_prompts.py ADDED
@@ -0,0 +1,208 @@
+ from datetime import datetime
+ from typing import List, Optional
+
+ from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
+ from browser_use.agent.views import ActionResult, ActionModel
+ from browser_use.browser.views import BrowserState
+ from langchain_core.messages import HumanMessage, SystemMessage
+
+ from .custom_views import CustomAgentStepInfo
+
+
+ class CustomSystemPrompt(SystemPrompt):
+     def important_rules(self) -> str:
+         """
+         Returns the important rules for the agent.
+         """
+         text = r"""
+     1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
+        {
+          "current_state": {
+            "prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check whether the previous goals/actions were successful as intended by the task. Ignore the action result; the website is the ground truth. Also mention if something unexpected happened, like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed', you should reflect on this during your thought.",
+            "important_contents": "Output important contents closely related to the user's instruction on the current page. If there are none, output an empty string ''.",
+            "task_progress": "Task Progress is a general summary of the contents that have been completed so far. Summarize only the items that have actually been completed, based on the content at the current step and the history of operations. List each completed item individually, such as: 1. Input username. 2. Input password. 3. Click confirm button. Please return a string, not a list.",
+            "future_plans": "Based on the user's request and the current state, outline the remaining steps needed to complete the task. This should be a concise list of actions yet to be performed, such as: 1. Select a date. 2. Choose a specific time slot. 3. Confirm booking. Please return a string, not a list.",
+            "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next operation. If your prev_action_evaluation is 'Failed', reflect and output your reflection here.",
+            "summary": "Generate a brief natural-language description of the next actions based on your thought."
+          },
+          "action": [
+            * actions in sequence; please refer to **Common action sequences**. Each output action MUST be formatted as: {action_name: action_params} *
+          ]
+        }
+
+     2. ACTIONS: You can specify multiple actions to be executed in sequence.
+
+        Common action sequences:
+        - Form filling: [
+            {"input_text": {"index": 1, "text": "username"}},
+            {"input_text": {"index": 2, "text": "password"}},
+            {"click_element": {"index": 3}}
+          ]
+        - Navigation and extraction: [
+            {"go_to_url": {"url": "https://example.com"}},
+            {"extract_page_content": {}}
+          ]
+
+     3. ELEMENT INTERACTION:
+        - Only use indexes that exist in the provided element list
+        - Each element has a unique index number (e.g., "33[:]<button>")
+        - Elements marked with "_[:]" are non-interactive (for context only)
+
+     4. NAVIGATION & ERROR HANDLING:
+        - If no suitable elements exist, use other functions to complete the task
+        - If stuck, try alternative approaches
+        - Handle popups/cookies by accepting or closing them
+        - Use scroll to find elements you are looking for
+
+     5. TASK COMPLETION:
+        - If you think all the requirements of the user's instruction have been completed and no further operation is required, output the **Done** action to terminate the operation process.
+        - Don't hallucinate actions.
+        - If the task requires specific information, make sure to include everything in the done function. This is what the user will see.
+        - If you are running out of steps (current step), think about speeding up, and ALWAYS use the done action as the last action.
+        - Note that you must verify you have truly fulfilled the user's request by examining the actual page content, not just the actions you output; also check whether each action executed successfully. Pay particular attention when errors occur during action execution.
+
+     6. VISUAL CONTEXT:
+        - When an image is provided, use it to understand the page layout
+        - Bounding boxes with labels correspond to element indexes
+        - Each bounding box and its label have the same color
+        - Most often the label is inside the bounding box, on the top right
+        - Visual context helps verify element locations and relationships
+        - Sometimes labels overlap, so use the context to verify the correct element
+
+     7. FORM FILLING:
+        - If you fill an input field and your action sequence is interrupted, most often a list of suggestions popped up under the field and you need to first select the right element from the suggestion list.
+
+     8. ACTION SEQUENCING:
+        - Actions are executed in the order they appear in the list
+        - Each action should logically follow from the previous one
+        - If the page changes after an action, the sequence is interrupted and you get the new state.
+        - If content only disappears, the sequence continues.
+        - Only provide the action sequence up to the point where you expect the page to change.
+        - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page (saving, extracting, checkboxes...)
+        - Only use multiple actions if it makes sense.
+
+     9. EXTRACTION:
+        - If your task is to find information or do research, call extract_content on the specific pages to get and store the information.
+
+ """
+         text += f" - use maximum {self.max_actions_per_step} actions per sequence"
+         return text
+
+     def input_format(self) -> str:
+         return """
+     INPUT STRUCTURE:
+     1. Task: The user's instructions you need to complete.
+     2. Hints(Optional): Some hints to help you complete the user's instructions.
+     3. Memory: Important contents recorded during historical operations for use in subsequent operations.
+     4. Current URL: The webpage you're currently on
+     5. Available Tabs: List of open browser tabs
+     6. Interactive Elements: List in the format:
+        [index]<element_type>element_text</element_type>
+        - index: Numeric identifier for interaction
+        - element_type: HTML element type (button, input, etc.)
+        - element_text: Visible text or element description
+
+     Example:
+     [33]<button>Submit Form</button>
+     [] Non-interactive text
+
+     Notes:
+     - Only elements with numeric indexes inside [] are interactive
+     - [] elements provide context but cannot be interacted with
+     """
+
+
+ class CustomAgentMessagePrompt(AgentMessagePrompt):
+     def __init__(
+             self,
+             state: BrowserState,
+             actions: Optional[List[ActionModel]] = None,
+             result: Optional[List[ActionResult]] = None,
+             include_attributes: list[str] = [],
+             max_error_length: int = 400,
+             step_info: Optional[CustomAgentStepInfo] = None,
+     ):
+         super().__init__(state=state,
+                          result=result,
+                          include_attributes=include_attributes,
+                          max_error_length=max_error_length,
+                          step_info=step_info
+                          )
+         self.actions = actions
+
+     def get_user_message(self, use_vision: bool = True) -> HumanMessage:
+         if self.step_info:
+             step_info_description = f'Current step: {self.step_info.step_number}/{self.step_info.max_steps}\n'
+         else:
+             step_info_description = ''
+
+         time_str = datetime.now().strftime("%Y-%m-%d %H:%M")
+         step_info_description += f"Current date and time: {time_str}"
+
+         elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)
+
+         has_content_above = (self.state.pixels_above or 0) > 0
+         has_content_below = (self.state.pixels_below or 0) > 0
+
+         if elements_text != '':
+             if has_content_above:
+                 elements_text = (
+                     f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
+                 )
+             else:
+                 elements_text = f'[Start of page]\n{elements_text}'
+             if has_content_below:
+                 elements_text = (
+                     f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...'
+                 )
+             else:
+                 elements_text = f'{elements_text}\n[End of page]'
+         else:
+             elements_text = 'empty page'
+
+         state_description = f"""
+ {step_info_description}
+ 1. Task: {self.step_info.task}.
+ 2. Hints(Optional):
+ {self.step_info.add_infos}
+ 3. Memory:
+ {self.step_info.memory}
+ 4. Current url: {self.state.url}
+ 5. Available tabs:
+ {self.state.tabs}
+ 6. Interactive elements:
+ {elements_text}
+ """
+
+         if self.actions and self.result:
+             state_description += "\n **Previous Actions** \n"
+             state_description += f'Previous step: {self.step_info.step_number - 1}/{self.step_info.max_steps} \n'
+             for i, result in enumerate(self.result):
+                 action = self.actions[i]
+                 state_description += f"Previous action {i + 1}/{len(self.result)}: {action.model_dump_json(exclude_unset=True)}\n"
+                 if result.include_in_memory:
+                     if result.extracted_content:
+                         state_description += f"Result of previous action {i + 1}/{len(self.result)}: {result.extracted_content}\n"
+                     if result.error:
+                         # only use the last max_error_length characters of the error
+                         error = result.error[-self.max_error_length:]
+                         state_description += (
+                             f"Error of previous action {i + 1}/{len(self.result)}: ...{error}\n"
+                         )
+
+         if self.state.screenshot and use_vision:
+             # Format message for a vision model
+             return HumanMessage(
+                 content=[
+                     {'type': 'text', 'text': state_description},
+                     {
+                         'type': 'image_url',
+                         'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},
+                     },
+                 ]
+             )
+
+         return HumanMessage(content=state_description)
src/agent/custom_views.py ADDED
@@ -0,0 +1,55 @@
+ from dataclasses import dataclass
+ from typing import Type
+
+ from browser_use.agent.views import AgentOutput
+ from browser_use.controller.registry.views import ActionModel
+ from pydantic import BaseModel, ConfigDict, Field, create_model
+
+
+ @dataclass
+ class CustomAgentStepInfo:
+     step_number: int
+     max_steps: int
+     task: str
+     add_infos: str
+     memory: str
+     task_progress: str
+     future_plans: str
+
+
+ class CustomAgentBrain(BaseModel):
+     """Current state of the agent"""
+
+     prev_action_evaluation: str
+     important_contents: str
+     task_progress: str
+     future_plans: str
+     thought: str
+     summary: str
+
+
+ class CustomAgentOutput(AgentOutput):
+     """Output model for the agent
+
+     @dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model.
+     """
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     current_state: CustomAgentBrain
+     action: list[ActionModel]
+
+     @staticmethod
+     def type_with_custom_actions(
+             custom_actions: Type[ActionModel],
+     ) -> Type["CustomAgentOutput"]:
+         """Extend actions with custom actions"""
+         return create_model(
+             "CustomAgentOutput",
+             __base__=CustomAgentOutput,
+             action=(
+                 list[custom_actions],
+                 Field(...),
+             ),  # properly annotated field with no default
+             __module__=CustomAgentOutput.__module__,
+         )
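For reference, a small sketch of how the JSON that `CustomSystemPrompt` requests maps onto `CustomAgentBrain`; it assumes the class above is importable, and the sample payload is invented for illustration:

```python
import json

# Hypothetical model output following the RESPONSE FORMAT in custom_prompts.py.
sample = json.loads("""
{
  "prev_action_evaluation": "Success - the search page loaded as intended.",
  "important_contents": "",
  "task_progress": "1. Opened google.com.",
  "future_plans": "1. Type the query. 2. Click search.",
  "thought": "The page is ready; next, fill the search box.",
  "summary": "Type 'OpenAI' into the search box."
}
""")

brain = CustomAgentBrain(**sample)  # pydantic validates all six string fields
print(brain.summary)
```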
src/browser/__init__.py ADDED
File without changes
src/browser/custom_browser.py ADDED
@@ -0,0 +1,27 @@
+ import logging
+
+ from browser_use.browser.browser import Browser
+ from browser_use.browser.context import BrowserContextConfig
+
+ from .custom_context import CustomBrowserContext
+
+ logger = logging.getLogger(__name__)
+
+
+ class CustomBrowser(Browser):
+
+     async def new_context(
+             self,
+             config: BrowserContextConfig = BrowserContextConfig()
+     ) -> CustomBrowserContext:
+         return CustomBrowserContext(config=config, browser=self)
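A minimal usage sketch for the factory above, assuming `BrowserConfig` from browser_use (as imported elsewhere in this diff); the body of `demo` is a placeholder:

```python
import asyncio

from browser_use.browser.browser import BrowserConfig


async def demo():
    browser = CustomBrowser(config=BrowserConfig(headless=True))
    context = await browser.new_context()
    try:
        pass  # hand `browser` and `context` to a CustomAgent here
    finally:
        await context.close()
        await browser.close()


asyncio.run(demo())
```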
src/browser/custom_context.py ADDED
@@ -0,0 +1,19 @@
+ import logging
+
+ from browser_use.browser.browser import Browser
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class CustomBrowserContext(BrowserContext):
+     def __init__(
+             self,
+             browser: "Browser",
+             config: BrowserContextConfig = BrowserContextConfig()
+     ):
+         super().__init__(browser=browser, config=config)
src/controller/__init__.py ADDED
File without changes
src/controller/custom_controller.py ADDED
@@ -0,0 +1,49 @@
+ import logging
+ from typing import Optional, Type
+
+ import pyperclip
+ from pydantic import BaseModel
+
+ from browser_use.agent.views import ActionResult
+ from browser_use.browser.context import BrowserContext
+ from browser_use.controller.service import Controller
+
+ logger = logging.getLogger(__name__)
+
+
+ class CustomController(Controller):
+     def __init__(self, exclude_actions: list[str] = [],
+                  output_model: Optional[Type[BaseModel]] = None
+                  ):
+         super().__init__(exclude_actions=exclude_actions, output_model=output_model)
+         self._register_custom_actions()
+
+     def _register_custom_actions(self):
+         """Register all custom browser actions"""
+
+         @self.registry.action("Copy text to clipboard")
+         def copy_to_clipboard(text: str):
+             pyperclip.copy(text)
+             return ActionResult(extracted_content=text)
+
+         @self.registry.action("Paste text from clipboard")
+         async def paste_from_clipboard(browser: BrowserContext):
+             text = pyperclip.paste()
+             # type the clipboard text into the current page
+             page = await browser.get_current_page()
+             await page.keyboard.type(text)
+
+             return ActionResult(extracted_content=text)
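Once registered, these actions are addressable from the agent's JSON action list by function name, matching the `{action_name: action_params}` format the system prompt specifies; the payloads below are illustrative only:

```python
controller = CustomController()

# Hypothetical action sequence an agent could emit for the custom actions above.
actions = [
    {"copy_to_clipboard": {"text": "hello world"}},
    {"paste_from_clipboard": {}},
]
```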
src/utils/__init__.py ADDED
File without changes
src/utils/agent_state.py ADDED
@@ -0,0 +1,30 @@
+ import asyncio
+
+
+ class AgentState:
+     _instance = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super(AgentState, cls).__new__(cls)
+         return cls._instance
+
+     def __init__(self):
+         # __init__ runs on every AgentState() call, so initialize only once
+         if not hasattr(self, '_stop_requested'):
+             self._stop_requested = asyncio.Event()
+             self.last_valid_state = None  # store the last valid browser state
+
+     def request_stop(self):
+         self._stop_requested.set()
+
+     def clear_stop(self):
+         self._stop_requested.clear()
+         self.last_valid_state = None
+
+     def is_stop_requested(self):
+         return self._stop_requested.is_set()
+
+     def set_last_valid_state(self, state):
+         self.last_valid_state = state
+
+     def get_last_valid_state(self):
+         return self.last_valid_state
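Because `__new__` always returns the same instance, every part of the app observes the same stop flag; a quick sketch:

```python
ui_state = AgentState()
agent_side = AgentState()

ui_state.request_stop()                # e.g. the user clicks "Stop" in the UI
assert agent_side.is_stop_requested()  # same singleton, so the agent sees it

agent_side.clear_stop()                # also resets last_valid_state to None
```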
src/utils/deep_research.py ADDED
@@ -0,0 +1,377 @@
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ import asyncio
+ import json
+ import logging
+ import os
+ import re
+ from uuid import uuid4
+
+ from browser_use.agent.views import ActionResult
+ from browser_use.browser.browser import BrowserConfig
+ from browser_use.browser.context import BrowserContext
+ from json_repair import repair_json
+ from langchain.schema import SystemMessage, HumanMessage
+ from main_content_extractor import MainContentExtractor
+
+ from src.agent.custom_agent import CustomAgent
+ from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
+ from src.browser.custom_browser import CustomBrowser
+ from src.controller.custom_controller import CustomController
+
+ logger = logging.getLogger(__name__)
+
+
+ async def deep_research(task, llm, agent_state=None, **kwargs):
+     task_id = str(uuid4())
+     save_dir = kwargs.get("save_dir", os.path.join("./tmp/deep_research", task_id))
+     logger.info(f"Save Deep Research at: {save_dir}")
+     os.makedirs(save_dir, exist_ok=True)
+
+     # max query num per iteration
+     max_query_num = kwargs.get("max_query_num", 3)
+
+     use_own_browser = kwargs.get("use_own_browser", False)
+     extra_chromium_args = []
+     if use_own_browser:
+         # TODO: if using your own browser, max query num must be 1 per iteration; how to solve this?
+         max_query_num = 1
+         chrome_path = os.getenv("CHROME_PATH", None)
+         if chrome_path == "":
+             chrome_path = None
+         chrome_user_data = os.getenv("CHROME_USER_DATA", None)
+         if chrome_user_data:
+             extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
+
+         browser = CustomBrowser(
+             config=BrowserConfig(
+                 headless=kwargs.get("headless", False),
+                 disable_security=kwargs.get("disable_security", True),
+                 chrome_instance_path=chrome_path,
+                 extra_chromium_args=extra_chromium_args,
+             )
+         )
+         browser_context = await browser.new_context()
+     else:
+         browser = None
+         browser_context = None
+
+     controller = CustomController()
+
+     @controller.registry.action(
+         'Extract page content to get the pure markdown.',
+     )
+     async def extract_content(browser: BrowserContext):
+         page = await browser.get_current_page()
+         # use the jina.ai reader to turn the page into markdown
+         url = page.url
+
+         jina_url = f"https://r.jina.ai/{url}"
+         await page.goto(jina_url)
+         output_format = 'markdown'
+         content = MainContentExtractor.extract(  # type: ignore
+             html=await page.content(),
+             output_format=output_format,
+         )
+         # go back to the original url
+         await page.go_back()
+         msg = f'Extracted page content:\n {content}\n'
+         logger.info(msg)
+         return ActionResult(extracted_content=msg)
+
+     search_system_prompt = f"""
+     You are a **Deep Researcher**, an AI agent specializing in in-depth information gathering and research using a web browser with **automated execution capabilities**. Your expertise lies in formulating comprehensive research plans and executing them meticulously to fulfill complex user requests. You will analyze user instructions, devise a detailed research plan, and determine the necessary search queries to gather the required information.
+
+     **Your Task:**
+
+     Given a user's research topic, you will:
+
+     1. **Develop a Research Plan:** Outline the key aspects and subtopics that need to be investigated to thoroughly address the user's request. This plan should be a high-level overview of the research direction.
+     2. **Generate Search Queries:** Based on your research plan, generate a list of specific search queries to be executed in a web browser. These queries should be designed to efficiently gather relevant information for each aspect of your plan.
+
+     **Output Format:**
+
+     Your output will be a JSON object with the following structure:
+
+     ```json
+     {{
+       "plan": "A concise, high-level research plan outlining the key areas to investigate.",
+       "queries": [
+         "search query 1",
+         "search query 2",
+         //... up to a maximum of {max_query_num} search queries
+       ]
+     }}
+     ```
+
+     **Important:**
+
+     * Limit your output to a **maximum of {max_query_num}** search queries.
+     * Craft the search queries to help the automated agent find the needed information. Consider what keywords are most likely to lead to useful results.
+     * If you have gathered all the information you need and no further search queries are required, output queries with an empty list: `[]`
+     * Make sure the output search queries are different from the history queries.
+
+     **Inputs:**
+
+     1. **User Instruction:** The original instruction given by the user.
+     2. **Previous Queries:** History queries.
+     3. **Previous Search Results:** Textual data gathered from prior search queries. If there are no previous search results, this string will be empty.
+     """
+     search_messages = [SystemMessage(content=search_system_prompt)]
+
+     record_system_prompt = """
+     You are an expert information recorder. Your role is to process user instructions, current search results, and previously recorded information to extract, summarize, and record new, useful information that helps fulfill the user's request. Your output will be a JSON formatted list, where each element represents a piece of extracted information and follows the structure: `{"url": "source_url", "title": "source_title", "summary_content": "concise_summary", "thinking": "reasoning"}`.
+
+     **Important Considerations:**
+
+     1. **Minimize Information Loss:** While concise, prioritize retaining important details and nuances from the sources. Aim for a summary that captures the essence of the information without over-simplification. **Crucially, ensure to preserve key data and figures within the `summary_content`. This is essential for later stages, such as generating tables and reports.**
+
+     2. **Avoid Redundancy:** Do not record information that is already present in the Previous Recorded Information. Check for semantic similarity, not just exact matches. However, if the same information is expressed differently in a new source and this variation adds valuable context or clarity, it should be included.
+
+     3. **Source Information:** Extract and include the source title and URL for each piece of information summarized. This is crucial for verification and context. **The Current Search Results are provided in a specific format, where each item starts with "Title:", followed by the title, then "URL Source:", followed by the URL, and finally "Markdown Content:", followed by the content. Please extract the title and URL from this structure.** If a piece of information cannot be attributed to a specific source from the provided search results, use `"url": "unknown"` and `"title": "unknown"`.
+
+     4. **Thinking and Report Structure:** For each extracted piece of information, add a `"thinking"` key. This field should contain your assessment of how this information could be used in a report, which section it might belong to (e.g., introduction, background, analysis, conclusion, specific subtopics), and any other relevant thoughts about its significance or connection to other information.
+
+     **Output Format:**
+
+     Provide your output as a JSON formatted list. Each item in the list must adhere to the following format:
+
+     ```json
+     [
+       {
+         "url": "source_url_1",
+         "title": "source_title_1",
+         "summary_content": "Concise summary of content. Remember to include key data and figures here.",
+         "thinking": "This could be used in the introduction to set the context. It also relates to the section on the history of the topic."
+       },
+       // ... more entries
+       {
+         "url": "unknown",
+         "title": "unknown",
+         "summary_content": "concise_summary_of_content_without_clear_source",
+         "thinking": "This might be useful background information, but I need to verify its accuracy. Could be used in the methodology section to explain how data was collected."
+       }
+     ]
+     ```
+
+     **Inputs:**
+
+     1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking.
+     2. **Previous Recorded Information:** Textual data gathered and recorded from previous searches and processing, represented as a single text string.
+     3. **Current Search Plan:** Research plan for the current search.
+     4. **Current Search Query:** The current search query.
+     5. **Current Search Results:** Textual data gathered from the most recent search query.
+     """
+     record_messages = [SystemMessage(content=record_system_prompt)]
+
+     search_iteration = 0
+     max_search_iterations = kwargs.get("max_search_iterations", 10)  # limit search iterations to prevent infinite loops
+     use_vision = kwargs.get("use_vision", False)
+
+     history_query = []
+     history_infos = []
+     try:
+         while search_iteration < max_search_iterations:
+             search_iteration += 1
+             logger.info(f"Start search iteration {search_iteration}...")
+             history_query_ = json.dumps(history_query, indent=4)
+             history_infos_ = json.dumps(history_infos, indent=4)
+             query_prompt = f"This is search {search_iteration} of {max_search_iterations} maximum searches allowed.\n User Instruction:{task} \n Previous Queries:\n {history_query_} \n Previous Search Results:\n {history_infos_}\n"
+             search_messages.append(HumanMessage(content=query_prompt))
+             # only send the system prompt plus the latest human message to limit context size
+             ai_query_msg = llm.invoke(search_messages[:1] + search_messages[-1:])
+             search_messages.append(ai_query_msg)
+             if hasattr(ai_query_msg, "reasoning_content"):
+                 logger.info("🤯 Start Search Deep Thinking: ")
+                 logger.info(ai_query_msg.reasoning_content)
+                 logger.info("🤯 End Search Deep Thinking")
+             ai_query_content = ai_query_msg.content.replace("```json", "").replace("```", "")
+             ai_query_content = repair_json(ai_query_content)
+             ai_query_content = json.loads(ai_query_content)
+             query_plan = ai_query_content["plan"]
+             logger.info(f"Current Iteration {search_iteration} Planning:")
+             logger.info(query_plan)
+             query_tasks = ai_query_content["queries"]
+             if not query_tasks:
+                 break
+             else:
+                 query_tasks = query_tasks[:max_query_num]
+                 history_query.extend(query_tasks)
+                 logger.info("Query tasks:")
+                 logger.info(query_tasks)
+
+             # 2. Perform web searches and auto-execute with browser-use agents
+             add_infos = "1. Please click on the most relevant link to get information and go deeper, instead of just staying on the search page. \n" \
+                         "2. When opening a PDF file, please remember to extract the content using extract_content instead of simply opening it for the user to view.\n"
+             if use_own_browser:
+                 # a single shared browser can only run one agent (and one query) at a time
+                 agent = CustomAgent(
+                     task=query_tasks[0],
+                     llm=llm,
+                     add_infos=add_infos,
+                     browser=browser,
+                     browser_context=browser_context,
+                     use_vision=use_vision,
+                     system_prompt_class=CustomSystemPrompt,
+                     agent_prompt_class=CustomAgentMessagePrompt,
+                     max_actions_per_step=5,
+                     controller=controller
+                 )
+                 agent_result = await agent.run(max_steps=kwargs.get("max_steps", 10))
+                 query_results = [agent_result]
+                 # manually close all tabs
+                 session = await browser_context.get_session()
+                 pages = session.context.pages
+                 await browser_context.create_new_tab()
+                 for page in pages:
+                     await page.close()
+             else:
+                 # run one agent per query in parallel
+                 agents = [CustomAgent(
+                     task=query,
+                     llm=llm,
+                     add_infos=add_infos,
+                     browser=browser,
+                     browser_context=browser_context,
+                     use_vision=use_vision,
+                     system_prompt_class=CustomSystemPrompt,
+                     agent_prompt_class=CustomAgentMessagePrompt,
+                     max_actions_per_step=5,
+                     controller=controller,
+                 ) for query in query_tasks]
+                 query_results = await asyncio.gather(
+                     *[agent.run(max_steps=kwargs.get("max_steps", 10)) for agent in agents])
+
+             if agent_state and agent_state.is_stop_requested():
+                 # stop requested from the UI
+                 break
+             # 3. Summarize search results
+             query_result_dir = os.path.join(save_dir, "query_results")
+             os.makedirs(query_result_dir, exist_ok=True)
+             for i in range(len(query_tasks)):
+                 query_result = query_results[i].final_result()
+                 if not query_result:
+                     continue
+                 query_save_path = os.path.join(query_result_dir, f"{search_iteration}-{i}.md")
+                 logger.info(f"save query: {query_tasks[i]} at {query_save_path}")
+                 with open(query_save_path, "w", encoding="utf-8") as fw:
+                     fw.write(f"Query: {query_tasks[i]}\n")
+                     fw.write(query_result)
+                 # split the query result in case the content is too long
+                 query_results_split = query_result.split("Extracted page content:")
+                 for qi, query_result_ in enumerate(query_results_split):
+                     if not query_result_:
+                         continue
+                     else:
+                         # TODO: limit content length: 128k tokens, ~3 chars per token
+                         query_result_ = query_result_[:128000 * 3]
+                     history_infos_ = json.dumps(history_infos, indent=4)
+                     record_prompt = f"User Instruction:{task}. \nPrevious Recorded Information:\n {history_infos_}\n Current Search Iteration: {search_iteration}\n Current Search Plan:\n{query_plan}\n Current Search Query:\n {query_tasks[i]}\n Current Search Results: {query_result_}\n "
+                     record_messages.append(HumanMessage(content=record_prompt))
+                     ai_record_msg = llm.invoke(record_messages[:1] + record_messages[-1:])
+                     record_messages.append(ai_record_msg)
+                     if hasattr(ai_record_msg, "reasoning_content"):
+                         logger.info("🤯 Start Record Deep Thinking: ")
+                         logger.info(ai_record_msg.reasoning_content)
+                         logger.info("🤯 End Record Deep Thinking")
+                     record_content = ai_record_msg.content
+                     record_content = repair_json(record_content)
+                     new_record_infos = json.loads(record_content)
+                     history_infos.extend(new_record_infos)
+             if agent_state and agent_state.is_stop_requested():
+                 # stop requested from the UI
+                 break
+
+         logger.info("\nFinish Searching, Start Generating Report...")
+
+         # 4. Report generation in Markdown
+         return await generate_final_report(task, history_infos, save_dir, llm)
+
+     except Exception as e:
+         logger.error(f"Deep research Error: {e}")
+         return await generate_final_report(task, history_infos, save_dir, llm, str(e))
+     finally:
+         # close the context before the browser that owns it
+         if browser_context:
+             await browser_context.close()
+         if browser:
+             await browser.close()
+         logger.info("Browser closed.")
+
+
+ async def generate_final_report(task, history_infos, save_dir, llm, error_msg=None):
+     """Generate the report from collected information, with error handling"""
+     try:
+         logger.info("\nAttempting to generate final report from collected data...")
+
+         writer_system_prompt = """
+         You are a **Deep Researcher** and a professional report writer tasked with creating polished, high-quality reports that fully meet the user's needs, based on the user's instructions and the relevant information provided. You will write the report using Markdown format, ensuring it is both informative and visually appealing.
+
+         **Specific Instructions:**
+
+         * **Structure for Impact:** The report must have a clear, logical, and impactful structure. Begin with a compelling introduction that immediately grabs the reader's attention. Develop well-structured body paragraphs that flow smoothly and logically, and conclude with a concise and memorable conclusion that summarizes key takeaways and leaves a lasting impression.
+         * **Engaging and Vivid Language:** Employ precise, vivid, and descriptive language to make the report captivating and enjoyable to read. Use stylistic techniques to enhance engagement. Tailor your tone, vocabulary, and writing style to perfectly suit the subject matter and the intended audience to maximize impact and readability.
+         * **Accuracy, Credibility, and Citations:** Ensure that all information presented is meticulously accurate, rigorously truthful, and robustly supported by the available data. **Cite sources exclusively using bracketed sequential numbers within the text (e.g., [1], [2], etc.). If no references are used, omit citations entirely.** These numbers must correspond to a numbered list of references at the end of the report.
+         * **Publication-Ready Formatting:** Adhere strictly to Markdown formatting for excellent readability and a clean, highly professional visual appearance. Pay close attention to formatting details like headings, lists, emphasis, and spacing to optimize the visual presentation and reader experience. The report should be ready for immediate publication upon completion, requiring minimal to no further editing for style or format.
+         * **Conciseness and Clarity (Unless Specified Otherwise):** When the user does not provide a specific length, prioritize concise and to-the-point writing, maximizing information density while maintaining clarity.
+         * **Data-Driven Comparisons with Tables:** **When appropriate and beneficial for enhancing clarity and impact, present data comparisons in well-structured Markdown tables. This is especially encouraged when dealing with numerical data or when a visual comparison can significantly improve the reader's understanding.**
+         * **Length Adherence:** When the user specifies a length constraint, meticulously stay within reasonable bounds of that specification, ensuring the content is appropriately scaled without sacrificing quality or completeness.
+         * **Comprehensive Instruction Following:** Pay meticulous attention to all details and nuances provided in the user instructions. Strive to fulfill every aspect of the user's request with the highest degree of accuracy and attention to detail, creating a report that not only meets but exceeds expectations for quality and professionalism.
+         * **Reference List Formatting:** The reference list at the end must be formatted as follows:
+           `[1] Title (URL, if available)`
+           **Each reference must be separated by a blank line to ensure proper spacing.** For example:
+
+           ```
+           [1] Title 1 (URL1, if available)
+
+           [2] Title 2 (URL2, if available)
+           ```
+           **Furthermore, ensure that the reference list is free of duplicates. Each unique source should be listed only once, regardless of how many times it is cited in the text.**
+         * **ABSOLUTE FINAL OUTPUT RESTRICTION:** **Your output must contain ONLY the finished, publication-ready Markdown report. Do not include ANY extraneous text, phrases, preambles, meta-commentary, or markdown code indicators (e.g., "```markdown```"). The report should begin directly with the title and introductory paragraph, and end directly after the conclusion and the reference list (if applicable).** **Your response will be deemed a failure if this instruction is not followed precisely.**
+
+         **Inputs:**
+
+         1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking.
+         2. **Search Information:** Information gathered from the search queries.
+         """
+
+         history_infos_ = json.dumps(history_infos, indent=4)
+         record_json_path = os.path.join(save_dir, "record_infos.json")
+         logger.info(f"save all recorded information at {record_json_path}")
+         with open(record_json_path, "w") as fw:
+             json.dump(history_infos, fw, indent=4)
+         report_prompt = f"User Instruction:{task} \n Search Information:\n {history_infos_}"
+         report_messages = [SystemMessage(content=writer_system_prompt),
+                            HumanMessage(content=report_prompt)]  # fresh context for report generation
+         ai_report_msg = llm.invoke(report_messages)
+         if hasattr(ai_report_msg, "reasoning_content"):
+             logger.info("🤯 Start Report Deep Thinking: ")
+             logger.info(ai_report_msg.reasoning_content)
+             logger.info("🤯 End Report Deep Thinking")
+         report_content = ai_report_msg.content
+         # strip any stray markdown fences the model may emit
+         report_content = re.sub(r"^```\s*markdown\s*|^\s*```|```\s*$", "", report_content, flags=re.MULTILINE)
+         report_content = report_content.strip()
+
+         # prepend an error notification if the research was interrupted
+         if error_msg:
+             report_content = f"## ⚠️ Research Incomplete - Partial Results\n" \
+                              f"**The research process was interrupted by an error:** {error_msg}\n\n" \
+                              f"{report_content}"
+
+         report_file_path = os.path.join(save_dir, "final_report.md")
+         with open(report_file_path, "w", encoding="utf-8") as f:
+             f.write(report_content)
+         logger.info(f"Save Report at: {report_file_path}")
+         return report_content, report_file_path
+
+     except Exception as report_error:
+         logger.error(f"Failed to generate partial report: {report_error}")
+         return f"Error generating report: {str(report_error)}", None
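A hedged sketch of driving the pipeline end to end; `get_llm_model` appears later in this diff, and the task string, model, and iteration limits are examples only:

```python
import asyncio

from src.utils.utils import get_llm_model
from src.utils.deep_research import deep_research

llm = get_llm_model("openai", model_name="gpt-4o", temperature=0.0)
report_md, report_path = asyncio.run(
    deep_research(
        "Summarize recent open-source browser-agent frameworks",
        llm,
        max_search_iterations=3,
        max_query_num=2,
        headless=True,
    )
)
print(report_path)  # ./tmp/deep_research/<task_id>/final_report.md
```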
src/utils/default_config_settings.py ADDED
@@ -0,0 +1,125 @@
+ import os
+ import pickle
+ import uuid
+
+ import gradio as gr
+
+
+ def default_config():
+     """Prepare the default configuration"""
+     return {
+         "agent_type": "custom",
+         "max_steps": 100,
+         "max_actions_per_step": 10,
+         "use_vision": True,
+         "tool_calling_method": "auto",
+         "llm_provider": "openai",
+         "llm_model_name": "gpt-4o",
+         "llm_num_ctx": 32000,
+         "llm_temperature": 1.0,
+         "llm_base_url": "",
+         "llm_api_key": "",
+         "use_own_browser": os.getenv("CHROME_PERSISTENT_SESSION", "false").lower() == "true",
+         "keep_browser_open": False,
+         "headless": False,
+         "disable_security": True,
+         "enable_recording": True,
+         "window_w": 1280,
+         "window_h": 1100,
+         "save_recording_path": "./tmp/record_videos",
+         "save_trace_path": "./tmp/traces",
+         "save_agent_history_path": "./tmp/agent_history",
+         "task": "go to google.com and type 'OpenAI' click search and give me the first url",
+     }
+
+
+ def load_config_from_file(config_file):
+     """Load settings from a <uuid>.pkl file; returns an error string on failure."""
+     try:
+         with open(config_file, 'rb') as f:
+             settings = pickle.load(f)
+         return settings
+     except Exception as e:
+         return f"Error loading configuration: {str(e)}"
+
+
+ def save_config_to_file(settings, save_dir="./tmp/webui_settings"):
+     """Save the current settings to a .pkl file named with a fresh UUID."""
+     os.makedirs(save_dir, exist_ok=True)
+     config_file = os.path.join(save_dir, f"{uuid.uuid4()}.pkl")
+     with open(config_file, 'wb') as f:
+         pickle.dump(settings, f)
+     return f"Configuration saved to {config_file}"
+
+
+ def save_current_config(*args):
+     current_config = {
+         "agent_type": args[0],
+         "max_steps": args[1],
+         "max_actions_per_step": args[2],
+         "use_vision": args[3],
+         "tool_calling_method": args[4],
+         "llm_provider": args[5],
+         "llm_model_name": args[6],
+         "llm_num_ctx": args[7],
+         "llm_temperature": args[8],
+         "llm_base_url": args[9],
+         "llm_api_key": args[10],
+         "use_own_browser": args[11],
+         "keep_browser_open": args[12],
+         "headless": args[13],
+         "disable_security": args[14],
+         "enable_recording": args[15],
+         "window_w": args[16],
+         "window_h": args[17],
+         "save_recording_path": args[18],
+         "save_trace_path": args[19],
+         "save_agent_history_path": args[20],
+         "task": args[21],
+     }
+     return save_config_to_file(current_config)
+
+
+ def update_ui_from_config(config_file):
+     if config_file is not None:
+         loaded_config = load_config_from_file(config_file.name)
+         if isinstance(loaded_config, dict):
+             return (
+                 gr.update(value=loaded_config.get("agent_type", "custom")),
+                 gr.update(value=loaded_config.get("max_steps", 100)),
+                 gr.update(value=loaded_config.get("max_actions_per_step", 10)),
+                 gr.update(value=loaded_config.get("use_vision", True)),
+                 gr.update(value=loaded_config.get("tool_calling_method", "auto")),
+                 gr.update(value=loaded_config.get("llm_provider", "openai")),
+                 gr.update(value=loaded_config.get("llm_model_name", "gpt-4o")),
+                 gr.update(value=loaded_config.get("llm_num_ctx", 32000)),
+                 gr.update(value=loaded_config.get("llm_temperature", 1.0)),
+                 gr.update(value=loaded_config.get("llm_base_url", "")),
+                 gr.update(value=loaded_config.get("llm_api_key", "")),
+                 gr.update(value=loaded_config.get("use_own_browser", False)),
+                 gr.update(value=loaded_config.get("keep_browser_open", False)),
+                 gr.update(value=loaded_config.get("headless", False)),
+                 gr.update(value=loaded_config.get("disable_security", True)),
+                 gr.update(value=loaded_config.get("enable_recording", True)),
+                 gr.update(value=loaded_config.get("window_w", 1280)),
+                 gr.update(value=loaded_config.get("window_h", 1100)),
+                 gr.update(value=loaded_config.get("save_recording_path", "./tmp/record_videos")),
+                 gr.update(value=loaded_config.get("save_trace_path", "./tmp/traces")),
+                 gr.update(value=loaded_config.get("save_agent_history_path", "./tmp/agent_history")),
+                 gr.update(value=loaded_config.get("task", "")),
+                 "Configuration loaded successfully."
+             )
+         else:
+             # 22 component updates plus the status message, matching the success branch
+             return (
+                 gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
+                 gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
+                 gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
+                 gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
+                 gr.update(), gr.update(), "Error: Invalid configuration file."
+             )
+     return (
+         gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
+         gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
+         gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
+         gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
+         gr.update(), gr.update(), "No file selected."
+     )
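A round-trip sketch for the pickle-based helpers above (paths are the defaults from this file):

```python
cfg = default_config()
cfg["headless"] = True

status = save_config_to_file(cfg, save_dir="./tmp/webui_settings")
print(status)  # "Configuration saved to ./tmp/webui_settings/<uuid>.pkl"

path = status.removeprefix("Configuration saved to ")
loaded = load_config_from_file(path)
assert isinstance(loaded, dict) and loaded["headless"] is True
```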
src/utils/llm.py ADDED
@@ -0,0 +1,136 @@
+ from typing import Any, Optional
+
+ from openai import OpenAI
+ from langchain_core.language_models.base import LanguageModelInput
+ from langchain_core.messages import AIMessage, SystemMessage
+ from langchain_core.runnables import RunnableConfig
+ from langchain_ollama import ChatOllama
+ from langchain_openai import ChatOpenAI
+
+
+ class DeepSeekR1ChatOpenAI(ChatOpenAI):
+
+     def __init__(self, *args: Any, **kwargs: Any) -> None:
+         super().__init__(*args, **kwargs)
+         # use a raw OpenAI client so the response keeps reasoning_content
+         self.client = OpenAI(
+             base_url=kwargs.get("base_url"),
+             api_key=kwargs.get("api_key")
+         )
+
+     def _to_message_history(self, input: LanguageModelInput) -> list:
+         """Convert LangChain messages into OpenAI chat-completions format."""
+         message_history = []
+         for input_ in input:
+             if isinstance(input_, SystemMessage):
+                 message_history.append({"role": "system", "content": input_.content})
+             elif isinstance(input_, AIMessage):
+                 message_history.append({"role": "assistant", "content": input_.content})
+             else:
+                 message_history.append({"role": "user", "content": input_.content})
+         return message_history
+
+     async def ainvoke(
+         self,
+         input: LanguageModelInput,
+         config: Optional[RunnableConfig] = None,
+         *,
+         stop: Optional[list[str]] = None,
+         **kwargs: Any,
+     ) -> AIMessage:
+         response = self.client.chat.completions.create(
+             model=self.model_name,
+             messages=self._to_message_history(input)
+         )
+
+         reasoning_content = response.choices[0].message.reasoning_content
+         content = response.choices[0].message.content
+         return AIMessage(content=content, reasoning_content=reasoning_content)
+
+     def invoke(
+         self,
+         input: LanguageModelInput,
+         config: Optional[RunnableConfig] = None,
+         *,
+         stop: Optional[list[str]] = None,
+         **kwargs: Any,
+     ) -> AIMessage:
+         response = self.client.chat.completions.create(
+             model=self.model_name,
+             messages=self._to_message_history(input)
+         )
+
+         reasoning_content = response.choices[0].message.reasoning_content
+         content = response.choices[0].message.content
+         return AIMessage(content=content, reasoning_content=reasoning_content)
+
+
+ def _split_r1_output(org_content: str) -> tuple:
+     """Split DeepSeek-R1 output into (reasoning_content, content)."""
+     if "</think>" in org_content:
+         reasoning_content = org_content.split("</think>")[0].replace("<think>", "")
+         content = org_content.split("</think>")[1]
+     else:
+         # no think tags present; treat the whole output as the answer
+         reasoning_content = ""
+         content = org_content
+     if "**JSON Response:**" in content:
+         content = content.split("**JSON Response:**")[-1]
+     return reasoning_content, content
+
+
+ class DeepSeekR1ChatOllama(ChatOllama):
+
+     async def ainvoke(
+         self,
+         input: LanguageModelInput,
+         config: Optional[RunnableConfig] = None,
+         *,
+         stop: Optional[list[str]] = None,
+         **kwargs: Any,
+     ) -> AIMessage:
+         org_ai_message = await super().ainvoke(input=input)
+         reasoning_content, content = _split_r1_output(org_ai_message.content)
+         return AIMessage(content=content, reasoning_content=reasoning_content)
+
+     def invoke(
+         self,
+         input: LanguageModelInput,
+         config: Optional[RunnableConfig] = None,
+         *,
+         stop: Optional[list[str]] = None,
+         **kwargs: Any,
+     ) -> AIMessage:
+         org_ai_message = super().invoke(input=input)
+         reasoning_content, content = _split_r1_output(org_ai_message.content)
+         return AIMessage(content=content, reasoning_content=reasoning_content)
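The wrappers above exist to surface `reasoning_content` alongside the final answer; a usage sketch, assuming a local Ollama server with a deepseek-r1 model pulled:

```python
from langchain_core.messages import HumanMessage

llm = DeepSeekR1ChatOllama(model="deepseek-r1:14b", base_url="http://localhost:11434")
msg = llm.invoke([HumanMessage(content="Reply with one word: ready")])

print(msg.reasoning_content)  # text that was between the <think> tags
print(msg.content)            # final answer after </think>
```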
src/utils/utils.py ADDED
@@ -0,0 +1,267 @@
1
+ import base64
2
+ import os
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Dict, Optional
6
+ import requests
7
+
8
+ from langchain_anthropic import ChatAnthropic
9
+ from langchain_mistralai import ChatMistralAI
10
+ from langchain_google_genai import ChatGoogleGenerativeAI
11
+ from langchain_ollama import ChatOllama
12
+ from langchain_openai import AzureChatOpenAI, ChatOpenAI
13
+ import gradio as gr
14
+
15
+ from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama
16
+
17
+ PROVIDER_DISPLAY_NAMES = {
18
+ "openai": "OpenAI",
19
+ "azure_openai": "Azure OpenAI",
20
+ "anthropic": "Anthropic",
21
+ "deepseek": "DeepSeek",
22
+ "google": "Google",
23
+ "alibaba": "Alibaba",
24
+ "moonshot": "MoonShot"
25
+ }
26
+
27
+ def get_llm_model(provider: str, **kwargs):
28
+ """
29
+ 获取LLM 模型
30
+ :param provider: 模型类型
31
+ :param kwargs:
32
+ :return:
33
+ """
34
+ if provider not in ["ollama"]:
35
+ env_var = f"{provider.upper()}_API_KEY"
36
+ api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
37
+ if not api_key:
38
+ handle_api_key_error(provider, env_var)
39
+ kwargs["api_key"] = api_key
40
+
41
+ if provider == "anthropic":
42
+ if not kwargs.get("base_url", ""):
43
+ base_url = "https://api.anthropic.com"
44
+ else:
45
+ base_url = kwargs.get("base_url")
46
+
47
+ return ChatAnthropic(
48
+ model_name=kwargs.get("model_name", "claude-3-5-sonnet-20240620"),
49
+ temperature=kwargs.get("temperature", 0.0),
50
+ base_url=base_url,
51
+ api_key=api_key,
52
+ )
53
+ elif provider == 'mistral':
54
+ if not kwargs.get("base_url", ""):
55
+ base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1")
56
+ else:
57
+ base_url = kwargs.get("base_url")
58
+ if not kwargs.get("api_key", ""):
59
+ api_key = os.getenv("MISTRAL_API_KEY", "")
60
+ else:
61
+ api_key = kwargs.get("api_key")
62
+
63
+ return ChatMistralAI(
64
+ model=kwargs.get("model_name", "mistral-large-latest"),
65
+ temperature=kwargs.get("temperature", 0.0),
66
+ base_url=base_url,
67
+ api_key=api_key,
68
+ )
69
+ elif provider == "openai":
70
+ if not kwargs.get("base_url", ""):
71
+ base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
72
+ else:
73
+ base_url = kwargs.get("base_url")
74
+
75
+ return ChatOpenAI(
76
+ model=kwargs.get("model_name", "gpt-4o"),
77
+ temperature=kwargs.get("temperature", 0.0),
78
+ base_url=base_url,
79
+ api_key=api_key,
80
+ )
81
+ elif provider == "deepseek":
82
+ if not kwargs.get("base_url", ""):
83
+ base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
84
+ else:
85
+ base_url = kwargs.get("base_url")
86
+
87
+ if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
88
+ return DeepSeekR1ChatOpenAI(
89
+ model=kwargs.get("model_name", "deepseek-reasoner"),
90
+ temperature=kwargs.get("temperature", 0.0),
91
+ base_url=base_url,
92
+ api_key=api_key,
93
+ )
94
+ else:
95
+ return ChatOpenAI(
96
+ model=kwargs.get("model_name", "deepseek-chat"),
97
+ temperature=kwargs.get("temperature", 0.0),
98
+ base_url=base_url,
99
+ api_key=api_key,
100
+ )
101
+ elif provider == "google":
102
+ return ChatGoogleGenerativeAI(
103
+ model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
104
+ temperature=kwargs.get("temperature", 0.0),
105
+ google_api_key=api_key,
106
+ )
107
+ elif provider == "ollama":
108
+ if not kwargs.get("base_url", ""):
109
+ base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
110
+ else:
111
+ base_url = kwargs.get("base_url")
112
+
113
+ if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
114
+ return DeepSeekR1ChatOllama(
115
+ model=kwargs.get("model_name", "deepseek-r1:14b"),
116
+ temperature=kwargs.get("temperature", 0.0),
117
+ num_ctx=kwargs.get("num_ctx", 32000),
118
+ base_url=base_url,
119
+ )
120
+ else:
121
+ return ChatOllama(
122
+ model=kwargs.get("model_name", "qwen2.5:7b"),
123
+ temperature=kwargs.get("temperature", 0.0),
124
+ num_ctx=kwargs.get("num_ctx", 32000),
125
+ num_predict=kwargs.get("num_predict", 1024),
126
+ base_url=base_url,
127
+ )
128
+ elif provider == "azure_openai":
129
+ if not kwargs.get("base_url", ""):
130
+ base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")
131
+ else:
132
+ base_url = kwargs.get("base_url")
133
+ api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
134
+ return AzureChatOpenAI(
135
+ model=kwargs.get("model_name", "gpt-4o"),
136
+ temperature=kwargs.get("temperature", 0.0),
137
+ api_version=api_version,
138
+ azure_endpoint=base_url,
139
+ api_key=api_key,
140
+ )
141
+ elif provider == "alibaba":
142
+ if not kwargs.get("base_url", ""):
143
+ base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1")
144
+ else:
145
+ base_url = kwargs.get("base_url")
146
+
147
+ return ChatOpenAI(
148
+ model=kwargs.get("model_name", "qwen-plus"),
149
+ temperature=kwargs.get("temperature", 0.0),
150
+ base_url=base_url,
151
+ api_key=api_key,
152
+ )
153
+
154
+     elif provider == "moonshot":
+         return ChatOpenAI(
+             model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"),
+             temperature=kwargs.get("temperature", 0.0),
+             base_url=os.getenv("MOONSHOT_ENDPOINT"),
+             api_key=os.getenv("MOONSHOT_API_KEY"),
+         )
+     else:
+         raise ValueError(f"Unsupported provider: {provider}")
+
+ # Predefined model names for common providers
+ model_names = {
+     "anthropic": ["claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
+     "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"],
+     "deepseek": ["deepseek-chat", "deepseek-reasoner"],
+     "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest", "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05"],
+     "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b", "deepseek-r1:14b", "deepseek-r1:32b"],
+     "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
+     "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"],
+     "alibaba": ["qwen-plus", "qwen-max", "qwen-turbo", "qwen-long"],
+     "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"],
+ }
+
+ # Callback to update the model name dropdown based on the selected provider
+ def update_model_dropdown(llm_provider, api_key=None, base_url=None):
+     """
+     Update the model name dropdown with predefined models for the selected provider.
+     """
+     # Use API keys from .env if not provided
+     if not api_key:
+         api_key = os.getenv(f"{llm_provider.upper()}_API_KEY", "")
+     if not base_url:
+         base_url = os.getenv(f"{llm_provider.upper()}_BASE_URL", "")
+
+     # Use predefined models for the selected provider
+     if llm_provider in model_names:
+         return gr.Dropdown(choices=model_names[llm_provider], value=model_names[llm_provider][0], interactive=True)
+     else:
+         return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True)
+
+ def handle_api_key_error(provider: str, env_var: str):
+     """
+     Handle a missing API key by raising a gr.Error with a clear message.
+     """
+     provider_display = PROVIDER_DISPLAY_NAMES.get(provider, provider.upper())
+     raise gr.Error(
+         f"💥 {provider_display} API key not found! 🔑 Please set the "
+         f"`{env_var}` environment variable or provide it in the UI."
+     )
+
+ def encode_image(img_path):
+     if not img_path:
+         return None
+     with open(img_path, "rb") as fin:
+         image_data = base64.b64encode(fin.read()).decode("utf-8")
+     return image_data
+
+
+ def get_latest_files(directory: str, file_types: list = ['.webm', '.zip']) -> Dict[str, Optional[str]]:
+     """Get the latest recording and trace files."""
+     latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types}
+
+     if not os.path.exists(directory):
+         os.makedirs(directory, exist_ok=True)
+         return latest_files
+
+     for file_type in file_types:
+         try:
+             matches = list(Path(directory).rglob(f"*{file_type}"))
+             if matches:
+                 latest = max(matches, key=lambda p: p.stat().st_mtime)
+                 # Only return files that are complete (not still being written)
+                 if time.time() - latest.stat().st_mtime > 1.0:
+                     latest_files[file_type] = str(latest)
+         except Exception as e:
+             print(f"Error getting latest {file_type} file: {e}")
+
+     return latest_files
+
+ async def capture_screenshot(browser_context):
+     """Capture and encode a screenshot of the current page."""
+     # Extract the underlying Playwright browser instance
+     playwright_browser = browser_context.browser.playwright_browser
+
+     # Check if the browser instance is valid and an existing context can be reused
+     if playwright_browser and playwright_browser.contexts:
+         playwright_context = playwright_browser.contexts[0]
+     else:
+         return None
+
+     # Access pages in the context
+     pages = None
+     if playwright_context:
+         pages = playwright_context.pages
+
+     # Use an existing page, preferring one that has navigated somewhere
+     if pages:
+         active_page = pages[0]
+         for page in pages:
+             if page.url != "about:blank":
+                 active_page = page
+     else:
+         return None
+
+     # Take the screenshot
+     try:
+         screenshot = await active_page.screenshot(
+             type='jpeg',
+             quality=75,
+             scale="css"
+         )
+         encoded = base64.b64encode(screenshot).decode('utf-8')
+         return encoded
+     except Exception:
+         return None
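For reference, a minimal usage sketch of the helpers above (an assumption on my part: they are exposed as `src.utils.utils`, which is how the tests below import them):

```python
from src.utils import utils  # assumption: the helpers above live in src/utils/utils.py

# Newest finished recording/trace per extension, or None if nothing is ready yet
latest = utils.get_latest_files("./tmp/record_videos", file_types=[".webm", ".zip"])
print(latest.get(".webm"))

# Base64-encode an image for a vision-model message payload
b64 = utils.encode_image("assets/examples/test.png")
if b64:
    print(f"encoded image: {len(b64)} base64 characters")
```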
supervisord.conf ADDED
@@ -0,0 +1,96 @@
+ [supervisord]
+ user=root
+ nodaemon=true
+ logfile=/dev/stdout
+ logfile_maxbytes=0
+ loglevel=debug
+
+ [program:xvfb]
+ command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ priority=100
+ startsecs=3
+ stopsignal=TERM
+ stopwaitsecs=10
+
+ [program:vnc_setup]
+ command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd"
+ autorestart=false
+ startsecs=0
+ priority=150
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+
+ [program:x11vnc]
+ command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && chmod 666 /var/log/x11vnc.log && sleep 5 && DISPLAY=:99 x11vnc -display :99 -forever -shared -rfbauth /root/.vnc/passwd -rfbport 5901 -o /var/log/x11vnc.log"
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ priority=200
+ startretries=10
+ startsecs=10
+ stopsignal=TERM
+ stopwaitsecs=10
+ depends_on=vnc_setup,xvfb
+
+ [program:x11vnc_log]
+ command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && tail -f /var/log/x11vnc.log"
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ priority=250
+ stopsignal=TERM
+ stopwaitsecs=5
+ depends_on=x11vnc
+
+ [program:novnc]
+ command=bash -c "sleep 5 && cd /opt/novnc && ./utils/novnc_proxy --vnc localhost:5901 --listen 0.0.0.0:6080 --web /opt/novnc"
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ priority=300
+ startretries=5
+ startsecs=3
+ depends_on=x11vnc
+
+ [program:persistent_browser]
+ environment=START_URL="data:text/html,<html><body><h1>Browser Ready</h1></body></html>"
+ command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\""
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ priority=350
+ startretries=5
+ startsecs=10
+ stopsignal=TERM
+ stopwaitsecs=15
+ depends_on=persistent_browser
+
+ [program:webui]
+ command=python webui.py --ip 0.0.0.0 --port 7788
+ directory=/app
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ priority=400
+ startretries=3
+ startsecs=3
+ stopsignal=TERM
+ stopwaitsecs=10
+ depends_on=persistent_browser
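This config brings the stack up as a chain: Xvfb on display :99, the VNC password setup, x11vnc on port 5901, the noVNC proxy on 6080, a persistent Chromium with remote debugging on 9222, and finally the webui on 7788. Note that stock supervisord has no `depends_on` option, so the ordering effectively rests on the `priority` values and the embedded `sleep` calls. A hedged pre-flight sketch that fails fast when the interpolated `%(ENV_*)s` variables are missing:

```python
# Pre-flight sketch: verify the environment variables this config interpolates
# before starting supervisord, instead of letting individual programs crash.
import os
import sys

REQUIRED = ["RESOLUTION", "RESOLUTION_WIDTH", "RESOLUTION_HEIGHT", "VNC_PASSWORD"]

missing = [name for name in REQUIRED if not os.getenv(name)]
if missing:
    sys.exit(f"supervisord.conf needs these environment variables: {', '.join(missing)}")
print("Environment OK; start with: supervisord -c supervisord.conf")
```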
tests/test_browser_use.py ADDED
@@ -0,0 +1,361 @@
+ import pdb
+
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ import sys
+
+ sys.path.append(".")
+ import asyncio
+ import os
+ import sys
+ from pprint import pprint
+
+ from browser_use import Agent
+ from browser_use.agent.views import AgentHistoryList
+
+ from src.utils import utils
+
+
+ async def test_browser_use_org():
+     from browser_use.browser.browser import Browser, BrowserConfig
+     from browser_use.browser.context import (
+         BrowserContextConfig,
+         BrowserContextWindowSize,
+     )
+
+     # llm = utils.get_llm_model(
+     #     provider="azure_openai",
+     #     model_name="gpt-4o",
+     #     temperature=0.8,
+     #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+     #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+     # )
+
+     # llm = utils.get_llm_model(
+     #     provider="deepseek",
+     #     model_name="deepseek-chat",
+     #     temperature=0.8
+     # )
+
+     llm = utils.get_llm_model(
+         provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
+     )
+
+     window_w, window_h = 1920, 1080
+     use_vision = False
+     use_own_browser = False
+     if use_own_browser:
+         chrome_path = os.getenv("CHROME_PATH", None)
+         if chrome_path == "":
+             chrome_path = None
+     else:
+         chrome_path = None
+
+     tool_calling_method = "json_schema"  # set to json_schema when using ollama
+
+     browser = Browser(
+         config=BrowserConfig(
+             headless=False,
+             disable_security=True,
+             chrome_instance_path=chrome_path,
+             extra_chromium_args=[f"--window-size={window_w},{window_h}"],
+         )
+     )
+     async with await browser.new_context(
+         config=BrowserContextConfig(
+             trace_path="./tmp/traces",
+             save_recording_path="./tmp/record_videos",
+             no_viewport=False,
+             browser_window_size=BrowserContextWindowSize(
+                 width=window_w, height=window_h
+             ),
+         )
+     ) as browser_context:
+         agent = Agent(
+             task="go to google.com, type 'OpenAI', click search, and give me the first url",
+             llm=llm,
+             browser_context=browser_context,
+             use_vision=use_vision,
+             tool_calling_method=tool_calling_method
+         )
+         history: AgentHistoryList = await agent.run(max_steps=10)
+
+         print("Final Result:")
+         pprint(history.final_result(), indent=4)
+
+         print("\nErrors:")
+         pprint(history.errors(), indent=4)
+
+         # e.g. xPaths the model clicked on
+         print("\nModel Outputs:")
+         pprint(history.model_actions(), indent=4)
+
+         print("\nThoughts:")
+         pprint(history.model_thoughts(), indent=4)
+     # close browser
+     await browser.close()
+
+
+ async def test_browser_use_custom():
+     from browser_use.browser.context import BrowserContextWindowSize
+     from browser_use.browser.browser import BrowserConfig
+     from playwright.async_api import async_playwright
+
+     from src.agent.custom_agent import CustomAgent
+     from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
+     from src.browser.custom_browser import CustomBrowser
+     from src.browser.custom_context import BrowserContextConfig
+     from src.controller.custom_controller import CustomController
+
+     window_w, window_h = 1920, 1080
+
+     # llm = utils.get_llm_model(
+     #     provider="openai",
+     #     model_name="gpt-4o",
+     #     temperature=0.8,
+     #     base_url=os.getenv("OPENAI_ENDPOINT", ""),
+     #     api_key=os.getenv("OPENAI_API_KEY", ""),
+     # )
+
+     llm = utils.get_llm_model(
+         provider="azure_openai",
+         model_name="gpt-4o",
+         temperature=0.8,
+         base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+         api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+     )
+
+     # llm = utils.get_llm_model(
+     #     provider="google",
+     #     model_name="gemini-2.0-flash",
+     #     temperature=1.0,
+     #     api_key=os.getenv("GOOGLE_API_KEY", "")
+     # )
+
+     # llm = utils.get_llm_model(
+     #     provider="deepseek",
+     #     model_name="deepseek-reasoner",
+     #     temperature=0.8
+     # )
+
+     # llm = utils.get_llm_model(
+     #     provider="deepseek",
+     #     model_name="deepseek-chat",
+     #     temperature=0.8
+     # )
+
+     # llm = utils.get_llm_model(
+     #     provider="ollama", model_name="qwen2.5:7b", temperature=0.5
+     # )
+
+     # llm = utils.get_llm_model(
+     #     provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
+     # )
+
+     controller = CustomController()
+     use_own_browser = True
+     disable_security = True
+     use_vision = False  # Set to False when using DeepSeek
+
+     max_actions_per_step = 1
+     playwright = None
+     browser = None
+     browser_context = None
+
+     try:
+         extra_chromium_args = [f"--window-size={window_w},{window_h}"]
+         if use_own_browser:
+             chrome_path = os.getenv("CHROME_PATH", None)
+             if chrome_path == "":
+                 chrome_path = None
+             chrome_user_data = os.getenv("CHROME_USER_DATA", None)
+             if chrome_user_data:
+                 extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
+         else:
+             chrome_path = None
+         browser = CustomBrowser(
+             config=BrowserConfig(
+                 headless=False,
+                 disable_security=disable_security,
+                 chrome_instance_path=chrome_path,
+                 extra_chromium_args=extra_chromium_args,
+             )
+         )
+         browser_context = await browser.new_context(
+             config=BrowserContextConfig(
+                 trace_path="./tmp/traces",
+                 save_recording_path="./tmp/record_videos",
+                 no_viewport=False,
+                 browser_window_size=BrowserContextWindowSize(
+                     width=window_w, height=window_h
+                 ),
+             )
+         )
+         agent = CustomAgent(
+             task="Give me the stock price of Tesla",
+             add_infos="",  # some hints to help the LLM complete the task
+             llm=llm,
+             browser=browser,
+             browser_context=browser_context,
+             controller=controller,
+             system_prompt_class=CustomSystemPrompt,
+             agent_prompt_class=CustomAgentMessagePrompt,
+             use_vision=use_vision,
+             max_actions_per_step=max_actions_per_step
+         )
+         history: AgentHistoryList = await agent.run(max_steps=100)
+
+         print("Final Result:")
+         pprint(history.final_result(), indent=4)
+
+         print("\nErrors:")
+         pprint(history.errors(), indent=4)
+
+         # e.g. xPaths the model clicked on
+         print("\nModel Outputs:")
+         pprint(history.model_actions(), indent=4)
+
+         print("\nThoughts:")
+         pprint(history.model_thoughts(), indent=4)
+         # close browser
+     except Exception:
+         import traceback
+
+         traceback.print_exc()
+     finally:
+         # Explicitly close the persistent context
+         if browser_context:
+             await browser_context.close()
+
+         # Stop the Playwright object
+         if playwright:
+             await playwright.stop()
+         if browser:
+             await browser.close()
+
+ async def test_browser_use_parallel():
+     from browser_use.browser.context import BrowserContextWindowSize
+     from browser_use.browser.browser import BrowserConfig
+     from playwright.async_api import async_playwright
+     from browser_use.browser.browser import Browser
+     from src.agent.custom_agent import CustomAgent
+     from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
+     from src.browser.custom_browser import CustomBrowser
+     from src.browser.custom_context import BrowserContextConfig
+     from src.controller.custom_controller import CustomController
+
+     window_w, window_h = 1920, 1080
+
+     # llm = utils.get_llm_model(
+     #     provider="openai",
+     #     model_name="gpt-4o",
+     #     temperature=0.8,
+     #     base_url=os.getenv("OPENAI_ENDPOINT", ""),
+     #     api_key=os.getenv("OPENAI_API_KEY", ""),
+     # )
+
+     # llm = utils.get_llm_model(
+     #     provider="azure_openai",
+     #     model_name="gpt-4o",
+     #     temperature=0.8,
+     #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+     #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+     # )
+
+     llm = utils.get_llm_model(
+         provider="gemini",
+         model_name="gemini-2.0-flash-exp",
+         temperature=1.0,
+         api_key=os.getenv("GOOGLE_API_KEY", "")
+     )
+
+     # llm = utils.get_llm_model(
+     #     provider="deepseek",
+     #     model_name="deepseek-reasoner",
+     #     temperature=0.8
+     # )
+
+     # llm = utils.get_llm_model(
+     #     provider="deepseek",
+     #     model_name="deepseek-chat",
+     #     temperature=0.8
+     # )
+
+     # llm = utils.get_llm_model(
+     #     provider="ollama", model_name="qwen2.5:7b", temperature=0.5
+     # )
+
+     # llm = utils.get_llm_model(
+     #     provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
+     # )
+
+     controller = CustomController()
+     use_own_browser = True
+     disable_security = True
+     use_vision = True  # Set to False when using DeepSeek
+
+     max_actions_per_step = 1
+     playwright = None
+     browser = None
+     browser_context = None
+
+     browser = Browser(
+         config=BrowserConfig(
+             disable_security=True,
+             headless=False,
+             new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
+         )
+     )
+
+     try:
+         agents = [
+             Agent(task=task, llm=llm, browser=browser)
+             for task in [
+                 'Search Google for weather in Tokyo',
+                 'Check Reddit front page title',
+                 'Search news about Barbie Hsu (大S) passing away',
+                 'Find NASA image of the day',
+                 # 'Check top story on CNN',
+                 # 'Search latest SpaceX launch date',
+                 # 'Look up population of Paris',
+                 # 'Find current time in Sydney',
+                 # 'Check who won last Super Bowl',
+                 # 'Search trending topics on Twitter',
+             ]
+         ]
+
+         # asyncio.gather returns one AgentHistoryList per agent
+         histories = await asyncio.gather(*[agent.run() for agent in agents])
+         pdb.set_trace()
+         for history in histories:
+             print("Final Result:")
+             pprint(history.final_result(), indent=4)
+
+             print("\nErrors:")
+             pprint(history.errors(), indent=4)
+
+             # e.g. xPaths the model clicked on
+             print("\nModel Outputs:")
+             pprint(history.model_actions(), indent=4)
+
+             print("\nThoughts:")
+             pprint(history.model_thoughts(), indent=4)
+         # close browser
+     except Exception:
+         import traceback
+
+         traceback.print_exc()
+     finally:
+         # Explicitly close the persistent context
+         if browser_context:
+             await browser_context.close()
+
+         # Stop the Playwright object
+         if playwright:
+             await playwright.stop()
+         if browser:
+             await browser.close()
+
+ if __name__ == "__main__":
+     # asyncio.run(test_browser_use_org())
+     # asyncio.run(test_browser_use_parallel())
+     asyncio.run(test_browser_use_custom())
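Because `asyncio.gather` returns one `AgentHistoryList` per agent, a single crashed agent cancels the whole batch by default. A hedged sketch that isolates per-task failures instead, assuming the same kind of `agents` list as in `test_browser_use_parallel`:

```python
import asyncio
from browser_use.agent.views import AgentHistoryList

async def run_agents_isolated(agents):
    # return_exceptions=True keeps one crashed agent from cancelling the rest
    results = await asyncio.gather(*(agent.run() for agent in agents), return_exceptions=True)
    for idx, result in enumerate(results):
        if isinstance(result, Exception):
            print(f"Agent #{idx} failed: {result}")
        else:
            history: AgentHistoryList = result
            print(f"Agent #{idx} result:", history.final_result())
```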
tests/test_deep_research.py ADDED
@@ -0,0 +1,30 @@
+ import asyncio
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ import sys
+
+ sys.path.append(".")
+
+ async def test_deep_research():
+     from src.utils.deep_research import deep_research
+     from src.utils import utils
+
+     task = "write a report about DeepSeek-R1, get its pdf"
+     llm = utils.get_llm_model(
+         provider="gemini",
+         model_name="gemini-2.0-flash-thinking-exp-01-21",
+         temperature=1.0,
+         api_key=os.getenv("GOOGLE_API_KEY", "")
+     )
+
+     report_content, report_file_path = await deep_research(task=task, llm=llm, agent_state=None,
+                                                            max_search_iterations=1,
+                                                            max_query_num=3,
+                                                            use_own_browser=False)
+
+
+ if __name__ == "__main__":
+     asyncio.run(test_deep_research())
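`test_deep_research` currently ignores the values `deep_research` returns. A minimal sanity-check sketch that could follow the `await` (the parameter names mirror the variables above; the checks themselves are an assumption about what a useful result looks like):

```python
import os

def check_report(report_content: str, report_file_path: str) -> None:
    # Minimal sanity checks on deep_research's return values
    assert report_content, "deep_research returned empty report content"
    assert report_file_path and os.path.exists(report_file_path), "report file was not written"
    print(f"Report saved to {report_file_path} ({len(report_content)} chars)")
```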
tests/test_llm_api.py ADDED
@@ -0,0 +1,137 @@
+ import os
+ import pdb
+ from dataclasses import dataclass
+
+ from dotenv import load_dotenv
+ from langchain_core.messages import HumanMessage, SystemMessage
+ from langchain_ollama import ChatOllama
+
+ load_dotenv()
+
+ import sys
+
+ sys.path.append(".")
+
+ @dataclass
+ class LLMConfig:
+     provider: str
+     model_name: str
+     temperature: float = 0.8
+     base_url: str = None
+     api_key: str = None
+
+ def create_message_content(text, image_path=None):
+     content = [{"type": "text", "text": text}]
+     image_format = "png" if image_path and image_path.endswith(".png") else "jpeg"
+     if image_path:
+         from src.utils import utils
+         image_data = utils.encode_image(image_path)
+         content.append({
+             "type": "image_url",
+             "image_url": {"url": f"data:image/{image_format};base64,{image_data}"}
+         })
+     return content
+
+ def get_env_value(key, provider):
+     env_mappings = {
+         "openai": {"api_key": "OPENAI_API_KEY", "base_url": "OPENAI_ENDPOINT"},
+         "azure_openai": {"api_key": "AZURE_OPENAI_API_KEY", "base_url": "AZURE_OPENAI_ENDPOINT"},
+         "google": {"api_key": "GOOGLE_API_KEY"},
+         "deepseek": {"api_key": "DEEPSEEK_API_KEY", "base_url": "DEEPSEEK_ENDPOINT"},
+         "mistral": {"api_key": "MISTRAL_API_KEY", "base_url": "MISTRAL_ENDPOINT"},
+         "alibaba": {"api_key": "ALIBABA_API_KEY", "base_url": "ALIBABA_ENDPOINT"},
+         "moonshot": {"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"},
+     }
+
+     if provider in env_mappings and key in env_mappings[provider]:
+         return os.getenv(env_mappings[provider][key], "")
+     return ""
+
+ def test_llm(config, query, image_path=None, system_message=None):
+     from src.utils import utils
+
+     # Special handling for Ollama-based models
+     if config.provider == "ollama":
+         if "deepseek-r1" in config.model_name:
+             from src.utils.llm import DeepSeekR1ChatOllama
+             llm = DeepSeekR1ChatOllama(model=config.model_name)
+         else:
+             llm = ChatOllama(model=config.model_name)
+
+         ai_msg = llm.invoke(query)
+         print(ai_msg.content)
+         if "deepseek-r1" in config.model_name:
+             pdb.set_trace()
+         return
+
+     # For other providers, use the standard configuration
+     llm = utils.get_llm_model(
+         provider=config.provider,
+         model_name=config.model_name,
+         temperature=config.temperature,
+         base_url=config.base_url or get_env_value("base_url", config.provider),
+         api_key=config.api_key or get_env_value("api_key", config.provider)
+     )
+
+     # Prepare messages for non-Ollama models
+     messages = []
+     if system_message:
+         messages.append(SystemMessage(content=create_message_content(system_message)))
+     messages.append(HumanMessage(content=create_message_content(query, image_path)))
+     ai_msg = llm.invoke(messages)
+
+     # Handle different response types
+     if hasattr(ai_msg, "reasoning_content"):
+         print(ai_msg.reasoning_content)
+     print(ai_msg.content)
+
+     if config.provider == "deepseek" and "deepseek-reasoner" in config.model_name:
+         print(llm.model_name)
+         pdb.set_trace()
+
+ def test_openai_model():
+     config = LLMConfig(provider="openai", model_name="gpt-4o")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+ def test_google_model():
+     # Enable your API key first if you haven't: https://ai.google.dev/palm_docs/oauth_quickstart
+     config = LLMConfig(provider="google", model_name="gemini-2.0-flash-exp")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+ def test_azure_openai_model():
+     config = LLMConfig(provider="azure_openai", model_name="gpt-4o")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+ def test_deepseek_model():
+     config = LLMConfig(provider="deepseek", model_name="deepseek-chat")
+     test_llm(config, "Who are you?")
+
+ def test_deepseek_r1_model():
+     config = LLMConfig(provider="deepseek", model_name="deepseek-reasoner")
+     test_llm(config, "Which is greater, 9.11 or 9.8?", system_message="You are a helpful AI assistant.")
+
+ def test_ollama_model():
+     config = LLMConfig(provider="ollama", model_name="qwen2.5:7b")
+     test_llm(config, "Sing a ballad of LangChain.")
+
+ def test_deepseek_r1_ollama_model():
+     config = LLMConfig(provider="ollama", model_name="deepseek-r1:14b")
+     test_llm(config, "How many 'r's are in the word 'strawberry'?")
+
+ def test_mistral_model():
+     config = LLMConfig(provider="mistral", model_name="pixtral-large-latest")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+ def test_moonshot_model():
+     config = LLMConfig(provider="moonshot", model_name="moonshot-v1-32k-vision-preview")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+ if __name__ == "__main__":
+     # test_openai_model()
+     # test_google_model()
+     # test_azure_openai_model()
+     # test_deepseek_model()
+     # test_ollama_model()
+     test_deepseek_r1_model()
+     # test_deepseek_r1_ollama_model()
+     # test_mistral_model()
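To smoke-test several providers in one run rather than toggling the commented calls, a small driver loop over the helpers above (assumptions: `LLMConfig` and `test_llm` are in scope as defined in this file, and the matching API keys are set in `.env`):

```python
# Hedged sketch: iterate a few (provider, model) pairs through test_llm
if __name__ == "__main__":
    for provider, model in [
        ("openai", "gpt-4o"),
        ("deepseek", "deepseek-chat"),
        ("moonshot", "moonshot-v1-8k-vision-preview"),
    ]:
        print(f"--- {provider}:{model} ---")
        test_llm(LLMConfig(provider=provider, model_name=model), "Reply with the word: pong")
```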
tests/test_playwright.py ADDED
@@ -0,0 +1,31 @@
+ import pdb
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ def test_connect_browser():
+     import os
+     from playwright.sync_api import sync_playwright
+
+     chrome_exe = os.getenv("CHROME_PATH", "")
+     chrome_use_data = os.getenv("CHROME_USER_DATA", "")
+
+     with sync_playwright() as p:
+         browser = p.chromium.launch_persistent_context(
+             user_data_dir=chrome_use_data,
+             executable_path=chrome_exe,
+             headless=False  # Keep the browser window visible
+         )
+
+         page = browser.new_page()
+         page.goto("https://mail.google.com/mail/u/0/#inbox")
+         page.wait_for_load_state()
+
+         input("Press the Enter key to close the browser...")
+
+         browser.close()
+
+
+ if __name__ == '__main__':
+     test_connect_browser()
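Related to the persistent browser that supervisord launches with `--remote-debugging-port=9222`: a hedged sketch that attaches to that already-running instance over CDP instead of launching a new one (assumption: the port is reachable from this process at `localhost:9222`):

```python
from playwright.sync_api import sync_playwright

def connect_over_cdp_example():
    with sync_playwright() as p:
        # Attach to an existing Chromium started with --remote-debugging-port=9222
        browser = p.chromium.connect_over_cdp("http://localhost:9222")
        context = browser.contexts[0] if browser.contexts else browser.new_context()
        page = context.new_page()
        page.goto("https://example.com")
        print(page.title())
        browser.close()

if __name__ == "__main__":
    connect_over_cdp_example()
```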
webui.py ADDED
@@ -0,0 +1,1073 @@
1
+ import pdb
2
+ import logging
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+ import os
8
+ import glob
9
+ import asyncio
10
+ import argparse
11
+ import os
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ import gradio as gr
16
+
17
+ from browser_use.agent.service import Agent
18
+ from playwright.async_api import async_playwright
19
+ from browser_use.browser.browser import Browser, BrowserConfig
20
+ from browser_use.browser.context import (
21
+ BrowserContextConfig,
22
+ BrowserContextWindowSize,
23
+ )
24
+ from langchain_ollama import ChatOllama
25
+ from playwright.async_api import async_playwright
26
+ from src.utils.agent_state import AgentState
27
+
28
+ from src.utils import utils
29
+ from src.agent.custom_agent import CustomAgent
30
+ from src.browser.custom_browser import CustomBrowser
31
+ from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
32
+ from src.browser.custom_context import BrowserContextConfig, CustomBrowserContext
33
+ from src.controller.custom_controller import CustomController
34
+ from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, Origin, Soft, Base
35
+ from src.utils.default_config_settings import default_config, load_config_from_file, save_config_to_file, save_current_config, update_ui_from_config
36
+ from src.utils.utils import update_model_dropdown, get_latest_files, capture_screenshot
37
+
38
+
39
+ # Global variables for persistence
40
+ _global_browser = None
41
+ _global_browser_context = None
42
+ _global_agent = None
43
+
44
+ # Create the global agent state instance
45
+ _global_agent_state = AgentState()
46
+
47
+ async def stop_agent():
48
+ """Request the agent to stop and update UI with enhanced feedback"""
49
+ global _global_agent_state, _global_browser_context, _global_browser, _global_agent
50
+
51
+ try:
52
+ # Request stop
53
+ _global_agent.stop()
54
+
55
+ # Update UI immediately
56
+ message = "Stop requested - the agent will halt at the next safe point"
57
+ logger.info(f"🛑 {message}")
58
+
59
+ # Return UI updates
60
+ return (
61
+ message, # errors_output
62
+ gr.update(value="Stopping...", interactive=False), # stop_button
63
+ gr.update(interactive=False), # run_button
64
+ )
65
+ except Exception as e:
66
+ error_msg = f"Error during stop: {str(e)}"
67
+ logger.error(error_msg)
68
+ return (
69
+ error_msg,
70
+ gr.update(value="Stop", interactive=True),
71
+ gr.update(interactive=True)
72
+ )
73
+
74
+ async def stop_research_agent():
75
+ """Request the agent to stop and update UI with enhanced feedback"""
76
+ global _global_agent_state, _global_browser_context, _global_browser
77
+
78
+ try:
79
+ # Request stop
80
+ _global_agent_state.request_stop()
81
+
82
+ # Update UI immediately
83
+ message = "Stop requested - the agent will halt at the next safe point"
84
+ logger.info(f"🛑 {message}")
85
+
86
+ # Return UI updates
87
+ return ( # errors_output
88
+ gr.update(value="Stopping...", interactive=False), # stop_button
89
+ gr.update(interactive=False), # run_button
90
+ )
91
+ except Exception as e:
92
+ error_msg = f"Error during stop: {str(e)}"
93
+ logger.error(error_msg)
94
+ return (
95
+ gr.update(value="Stop", interactive=True),
96
+ gr.update(interactive=True)
97
+ )
98
+
99
+ async def run_browser_agent(
100
+ agent_type,
101
+ llm_provider,
102
+ llm_model_name,
103
+ llm_num_ctx,
104
+ llm_temperature,
105
+ llm_base_url,
106
+ llm_api_key,
107
+ use_own_browser,
108
+ keep_browser_open,
109
+ headless,
110
+ disable_security,
111
+ window_w,
112
+ window_h,
113
+ save_recording_path,
114
+ save_agent_history_path,
115
+ save_trace_path,
116
+ enable_recording,
117
+ task,
118
+ add_infos,
119
+ max_steps,
120
+ use_vision,
121
+ max_actions_per_step,
122
+ tool_calling_method
123
+ ):
124
+ global _global_agent_state
125
+ _global_agent_state.clear_stop() # Clear any previous stop requests
126
+
127
+ try:
128
+ # Disable recording if the checkbox is unchecked
129
+ if not enable_recording:
130
+ save_recording_path = None
131
+
132
+ # Ensure the recording directory exists if recording is enabled
133
+ if save_recording_path:
134
+ os.makedirs(save_recording_path, exist_ok=True)
135
+
136
+ # Get the list of existing videos before the agent runs
137
+ existing_videos = set()
138
+ if save_recording_path:
139
+ existing_videos = set(
140
+ glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
141
+ + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
142
+ )
143
+
144
+ # Run the agent
145
+ llm = utils.get_llm_model(
146
+ provider=llm_provider,
147
+ model_name=llm_model_name,
148
+ num_ctx=llm_num_ctx,
149
+ temperature=llm_temperature,
150
+ base_url=llm_base_url,
151
+ api_key=llm_api_key,
152
+ )
153
+ if agent_type == "org":
154
+ final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_org_agent(
155
+ llm=llm,
156
+ use_own_browser=use_own_browser,
157
+ keep_browser_open=keep_browser_open,
158
+ headless=headless,
159
+ disable_security=disable_security,
160
+ window_w=window_w,
161
+ window_h=window_h,
162
+ save_recording_path=save_recording_path,
163
+ save_agent_history_path=save_agent_history_path,
164
+ save_trace_path=save_trace_path,
165
+ task=task,
166
+ max_steps=max_steps,
167
+ use_vision=use_vision,
168
+ max_actions_per_step=max_actions_per_step,
169
+ tool_calling_method=tool_calling_method
170
+ )
171
+ elif agent_type == "custom":
172
+ final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_custom_agent(
173
+ llm=llm,
174
+ use_own_browser=use_own_browser,
175
+ keep_browser_open=keep_browser_open,
176
+ headless=headless,
177
+ disable_security=disable_security,
178
+ window_w=window_w,
179
+ window_h=window_h,
180
+ save_recording_path=save_recording_path,
181
+ save_agent_history_path=save_agent_history_path,
182
+ save_trace_path=save_trace_path,
183
+ task=task,
184
+ add_infos=add_infos,
185
+ max_steps=max_steps,
186
+ use_vision=use_vision,
187
+ max_actions_per_step=max_actions_per_step,
188
+ tool_calling_method=tool_calling_method
189
+ )
190
+ else:
191
+ raise ValueError(f"Invalid agent type: {agent_type}")
192
+
193
+ # Get the list of videos after the agent runs (if recording is enabled)
194
+ latest_video = None
195
+ if save_recording_path:
196
+ new_videos = set(
197
+ glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
198
+ + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
199
+ )
200
+ if new_videos - existing_videos:
201
+ latest_video = list(new_videos - existing_videos)[0] # Get the first new video
202
+
203
+ return (
204
+ final_result,
205
+ errors,
206
+ model_actions,
207
+ model_thoughts,
208
+ latest_video,
209
+ trace_file,
210
+ history_file,
211
+ gr.update(value="Stop", interactive=True), # Re-enable stop button
212
+ gr.update(interactive=True) # Re-enable run button
213
+ )
214
+
215
+ except gr.Error:
216
+ raise
217
+
218
+ except Exception as e:
219
+ import traceback
220
+ traceback.print_exc()
221
+ errors = str(e) + "\n" + traceback.format_exc()
222
+ return (
223
+ '', # final_result
224
+ errors, # errors
225
+ '', # model_actions
226
+ '', # model_thoughts
227
+ None, # latest_video
228
+ None, # history_file
229
+ None, # trace_file
230
+ gr.update(value="Stop", interactive=True), # Re-enable stop button
231
+ gr.update(interactive=True) # Re-enable run button
232
+ )
233
+
234
+
235
+ async def run_org_agent(
236
+ llm,
237
+ use_own_browser,
238
+ keep_browser_open,
239
+ headless,
240
+ disable_security,
241
+ window_w,
242
+ window_h,
243
+ save_recording_path,
244
+ save_agent_history_path,
245
+ save_trace_path,
246
+ task,
247
+ max_steps,
248
+ use_vision,
249
+ max_actions_per_step,
250
+ tool_calling_method
251
+ ):
252
+ try:
253
+ global _global_browser, _global_browser_context, _global_agent_state, _global_agent
254
+
255
+ # Clear any previous stop request
256
+ _global_agent_state.clear_stop()
257
+
258
+ extra_chromium_args = [f"--window-size={window_w},{window_h}"]
259
+ if use_own_browser:
260
+ chrome_path = os.getenv("CHROME_PATH", None)
261
+ if chrome_path == "":
262
+ chrome_path = None
263
+ chrome_user_data = os.getenv("CHROME_USER_DATA", None)
264
+ if chrome_user_data:
265
+ extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
266
+ else:
267
+ chrome_path = None
268
+
269
+ if _global_browser is None:
270
+ _global_browser = Browser(
271
+ config=BrowserConfig(
272
+ headless=headless,
273
+ disable_security=disable_security,
274
+ chrome_instance_path=chrome_path,
275
+ extra_chromium_args=extra_chromium_args,
276
+ )
277
+ )
278
+
279
+ if _global_browser_context is None:
280
+ _global_browser_context = await _global_browser.new_context(
281
+ config=BrowserContextConfig(
282
+ trace_path=save_trace_path if save_trace_path else None,
283
+ save_recording_path=save_recording_path if save_recording_path else None,
284
+ no_viewport=False,
285
+ browser_window_size=BrowserContextWindowSize(
286
+ width=window_w, height=window_h
287
+ ),
288
+ )
289
+ )
290
+
291
+ if _global_agent is None:
292
+ _global_agent = Agent(
293
+ task=task,
294
+ llm=llm,
295
+ use_vision=use_vision,
296
+ browser=_global_browser,
297
+ browser_context=_global_browser_context,
298
+ max_actions_per_step=max_actions_per_step,
299
+ tool_calling_method=tool_calling_method
300
+ )
301
+ history = await _global_agent.run(max_steps=max_steps)
302
+
303
+ history_file = os.path.join(save_agent_history_path, f"{_global_agent.agent_id}.json")
304
+ _global_agent.save_history(history_file)
305
+
306
+ final_result = history.final_result()
307
+ errors = history.errors()
308
+ model_actions = history.model_actions()
309
+ model_thoughts = history.model_thoughts()
310
+
311
+ trace_file = get_latest_files(save_trace_path)
312
+
313
+ return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file
314
+ except Exception as e:
315
+ import traceback
316
+ traceback.print_exc()
317
+ errors = str(e) + "\n" + traceback.format_exc()
318
+ return '', errors, '', '', None, None
319
+ finally:
320
+ _global_agent = None
321
+ # Handle cleanup based on persistence configuration
322
+ if not keep_browser_open:
323
+ if _global_browser_context:
324
+ await _global_browser_context.close()
325
+ _global_browser_context = None
326
+
327
+ if _global_browser:
328
+ await _global_browser.close()
329
+ _global_browser = None
330
+
331
+ async def run_custom_agent(
332
+ llm,
333
+ use_own_browser,
334
+ keep_browser_open,
335
+ headless,
336
+ disable_security,
337
+ window_w,
338
+ window_h,
339
+ save_recording_path,
340
+ save_agent_history_path,
341
+ save_trace_path,
342
+ task,
343
+ add_infos,
344
+ max_steps,
345
+ use_vision,
346
+ max_actions_per_step,
347
+ tool_calling_method
348
+ ):
349
+ try:
350
+ global _global_browser, _global_browser_context, _global_agent_state, _global_agent
351
+
352
+ # Clear any previous stop request
353
+ _global_agent_state.clear_stop()
354
+
355
+ extra_chromium_args = [f"--window-size={window_w},{window_h}"]
356
+ if use_own_browser:
357
+ chrome_path = os.getenv("CHROME_PATH", None)
358
+ if chrome_path == "":
359
+ chrome_path = None
360
+ chrome_user_data = os.getenv("CHROME_USER_DATA", None)
361
+ if chrome_user_data:
362
+ extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
363
+ else:
364
+ chrome_path = None
365
+
366
+ controller = CustomController()
367
+
368
+ # Initialize global browser if needed
369
+ if _global_browser is None:
370
+ _global_browser = CustomBrowser(
371
+ config=BrowserConfig(
372
+ headless=headless,
373
+ disable_security=disable_security,
374
+ chrome_instance_path=chrome_path,
375
+ extra_chromium_args=extra_chromium_args,
376
+ )
377
+ )
378
+
379
+ if _global_browser_context is None:
380
+ _global_browser_context = await _global_browser.new_context(
381
+ config=BrowserContextConfig(
382
+ trace_path=save_trace_path if save_trace_path else None,
383
+ save_recording_path=save_recording_path if save_recording_path else None,
384
+ no_viewport=False,
385
+ browser_window_size=BrowserContextWindowSize(
386
+ width=window_w, height=window_h
387
+ ),
388
+ )
389
+ )
390
+
391
+ # Create and run agent
392
+ if _global_agent is None:
393
+ _global_agent = CustomAgent(
394
+ task=task,
395
+ add_infos=add_infos,
396
+ use_vision=use_vision,
397
+ llm=llm,
398
+ browser=_global_browser,
399
+ browser_context=_global_browser_context,
400
+ controller=controller,
401
+ system_prompt_class=CustomSystemPrompt,
402
+ agent_prompt_class=CustomAgentMessagePrompt,
403
+ max_actions_per_step=max_actions_per_step,
404
+ tool_calling_method=tool_calling_method
405
+ )
406
+ history = await _global_agent.run(max_steps=max_steps)
407
+
408
+ history_file = os.path.join(save_agent_history_path, f"{_global_agent.agent_id}.json")
409
+ _global_agent.save_history(history_file)
410
+
411
+ final_result = history.final_result()
412
+ errors = history.errors()
413
+ model_actions = history.model_actions()
414
+ model_thoughts = history.model_thoughts()
415
+
416
+ trace_file = get_latest_files(save_trace_path)
417
+
418
+ return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file
419
+ except Exception as e:
420
+ import traceback
421
+ traceback.print_exc()
422
+ errors = str(e) + "\n" + traceback.format_exc()
423
+ return '', errors, '', '', None, None
424
+ finally:
425
+ _global_agent = None
426
+ # Handle cleanup based on persistence configuration
427
+ if not keep_browser_open:
428
+ if _global_browser_context:
429
+ await _global_browser_context.close()
430
+ _global_browser_context = None
431
+
432
+ if _global_browser:
433
+ await _global_browser.close()
434
+ _global_browser = None
435
+
436
+ async def run_with_stream(
437
+ agent_type,
438
+ llm_provider,
439
+ llm_model_name,
440
+ llm_num_ctx,
441
+ llm_temperature,
442
+ llm_base_url,
443
+ llm_api_key,
444
+ use_own_browser,
445
+ keep_browser_open,
446
+ headless,
447
+ disable_security,
448
+ window_w,
449
+ window_h,
450
+ save_recording_path,
451
+ save_agent_history_path,
452
+ save_trace_path,
453
+ enable_recording,
454
+ task,
455
+ add_infos,
456
+ max_steps,
457
+ use_vision,
458
+ max_actions_per_step,
459
+ tool_calling_method
460
+ ):
461
+ global _global_agent_state
462
+ stream_vw = 80
463
+ stream_vh = int(80 * window_h // window_w)
464
+ if not headless:
465
+ result = await run_browser_agent(
466
+ agent_type=agent_type,
467
+ llm_provider=llm_provider,
468
+ llm_model_name=llm_model_name,
469
+ llm_num_ctx=llm_num_ctx,
470
+ llm_temperature=llm_temperature,
471
+ llm_base_url=llm_base_url,
472
+ llm_api_key=llm_api_key,
473
+ use_own_browser=use_own_browser,
474
+ keep_browser_open=keep_browser_open,
475
+ headless=headless,
476
+ disable_security=disable_security,
477
+ window_w=window_w,
478
+ window_h=window_h,
479
+ save_recording_path=save_recording_path,
480
+ save_agent_history_path=save_agent_history_path,
481
+ save_trace_path=save_trace_path,
482
+ enable_recording=enable_recording,
483
+ task=task,
484
+ add_infos=add_infos,
485
+ max_steps=max_steps,
486
+ use_vision=use_vision,
487
+ max_actions_per_step=max_actions_per_step,
488
+ tool_calling_method=tool_calling_method
489
+ )
490
+ # Add HTML content at the start of the result array
491
+ html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Using browser...</h1>"
492
+ yield [html_content] + list(result)
493
+ else:
494
+ try:
495
+ _global_agent_state.clear_stop()
496
+ # Run the browser agent in the background
497
+ agent_task = asyncio.create_task(
498
+ run_browser_agent(
499
+ agent_type=agent_type,
500
+ llm_provider=llm_provider,
501
+ llm_model_name=llm_model_name,
502
+ llm_num_ctx=llm_num_ctx,
503
+ llm_temperature=llm_temperature,
504
+ llm_base_url=llm_base_url,
505
+ llm_api_key=llm_api_key,
506
+ use_own_browser=use_own_browser,
507
+ keep_browser_open=keep_browser_open,
508
+ headless=headless,
509
+ disable_security=disable_security,
510
+ window_w=window_w,
511
+ window_h=window_h,
512
+ save_recording_path=save_recording_path,
513
+ save_agent_history_path=save_agent_history_path,
514
+ save_trace_path=save_trace_path,
515
+ enable_recording=enable_recording,
516
+ task=task,
517
+ add_infos=add_infos,
518
+ max_steps=max_steps,
519
+ use_vision=use_vision,
520
+ max_actions_per_step=max_actions_per_step,
521
+ tool_calling_method=tool_calling_method
522
+ )
523
+ )
524
+
525
+ # Initialize values for streaming
526
+ html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Using browser...</h1>"
527
+ final_result = errors = model_actions = model_thoughts = ""
528
+ latest_videos = trace = history_file = None
529
+
530
+
531
+ # Periodically update the stream while the agent task is running
532
+ while not agent_task.done():
533
+ try:
534
+ encoded_screenshot = await capture_screenshot(_global_browser_context)
535
+ if encoded_screenshot is not None:
536
+ html_content = f'<img src="data:image/jpeg;base64,{encoded_screenshot}" style="width:{stream_vw}vw; height:{stream_vh}vh ; border:1px solid #ccc;">'
537
+ else:
538
+ html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
539
+ except Exception as e:
540
+ html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
541
+
542
+ if _global_agent_state and _global_agent_state.is_stop_requested():
543
+ yield [
544
+ html_content,
545
+ final_result,
546
+ errors,
547
+ model_actions,
548
+ model_thoughts,
549
+ latest_videos,
550
+ trace,
551
+ history_file,
552
+ gr.update(value="Stopping...", interactive=False), # stop_button
553
+ gr.update(interactive=False), # run_button
554
+ ]
555
+ break
556
+ else:
557
+ yield [
558
+ html_content,
559
+ final_result,
560
+ errors,
561
+ model_actions,
562
+ model_thoughts,
563
+ latest_videos,
564
+ trace,
565
+ history_file,
566
+ gr.update(value="Stop", interactive=True), # Re-enable stop button
567
+ gr.update(interactive=True) # Re-enable run button
568
+ ]
569
+ await asyncio.sleep(0.05)
570
+
571
+ # Once the agent task completes, get the results
572
+ try:
573
+ result = await agent_task
574
+ final_result, errors, model_actions, model_thoughts, latest_videos, trace, history_file, stop_button, run_button = result
575
+ except gr.Error:
576
+ final_result = ""
577
+ model_actions = ""
578
+ model_thoughts = ""
579
+ latest_videos = trace = history_file = None
580
+
581
+ except Exception as e:
582
+ errors = f"Agent error: {str(e)}"
583
+
584
+ yield [
585
+ html_content,
586
+ final_result,
587
+ errors,
588
+ model_actions,
589
+ model_thoughts,
590
+ latest_videos,
591
+ trace,
592
+ history_file,
593
+ stop_button,
594
+ run_button
595
+ ]
596
+
597
+ except Exception as e:
598
+ import traceback
599
+ yield [
600
+ f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>",
601
+ "",
602
+ f"Error: {str(e)}\n{traceback.format_exc()}",
603
+ "",
604
+ "",
605
+ None,
606
+ None,
607
+ None,
608
+ gr.update(value="Stop", interactive=True), # Re-enable stop button
609
+ gr.update(interactive=True) # Re-enable run button
610
+ ]
611
+
612
+ # Define the theme map globally
613
+ theme_map = {
614
+ "Default": Default(),
615
+ "Soft": Soft(),
616
+ "Monochrome": Monochrome(),
617
+ "Glass": Glass(),
618
+ "Origin": Origin(),
619
+ "Citrus": Citrus(),
620
+ "Ocean": Ocean(),
621
+ "Base": Base()
622
+ }
623
+
624
+ async def close_global_browser():
625
+ global _global_browser, _global_browser_context
626
+
627
+ if _global_browser_context:
628
+ await _global_browser_context.close()
629
+ _global_browser_context = None
630
+
631
+ if _global_browser:
632
+ await _global_browser.close()
633
+ _global_browser = None
634
+
635
+ async def run_deep_search(research_task, max_search_iteration_input, max_query_per_iter_input, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision, use_own_browser, headless):
636
+ from src.utils.deep_research import deep_research
637
+ global _global_agent_state
638
+
639
+ # Clear any previous stop request
640
+ _global_agent_state.clear_stop()
641
+
642
+ llm = utils.get_llm_model(
643
+ provider=llm_provider,
644
+ model_name=llm_model_name,
645
+ num_ctx=llm_num_ctx,
646
+ temperature=llm_temperature,
647
+ base_url=llm_base_url,
648
+ api_key=llm_api_key,
649
+ )
650
+ markdown_content, file_path = await deep_research(research_task, llm, _global_agent_state,
651
+ max_search_iterations=max_search_iteration_input,
652
+ max_query_num=max_query_per_iter_input,
653
+ use_vision=use_vision,
654
+ headless=headless,
655
+ use_own_browser=use_own_browser
656
+ )
657
+
658
+ return markdown_content, file_path, gr.update(value="Stop", interactive=True), gr.update(interactive=True)
659
+
660
+
661
+ def create_ui(config, theme_name="Ocean"):
662
+ css = """
663
+ .gradio-container {
664
+ max-width: 1200px !important;
665
+ margin: auto !important;
666
+ padding-top: 20px !important;
667
+ }
668
+ .header-text {
669
+ text-align: center;
670
+ margin-bottom: 30px;
671
+ }
672
+ .theme-section {
673
+ margin-bottom: 20px;
674
+ padding: 15px;
675
+ border-radius: 10px;
676
+ }
677
+ """
678
+
679
+ with gr.Blocks(
680
+ title="Browser Use WebUI", theme=theme_map[theme_name], css=css
681
+ ) as demo:
682
+ with gr.Row():
683
+ gr.Markdown(
684
+ """
685
+ # 🌐 Browser Use WebUI
686
+ ### Control your browser with AI assistance
687
+ """,
688
+ elem_classes=["header-text"],
689
+ )
690
+
691
+ with gr.Tabs() as tabs:
692
+ with gr.TabItem("⚙️ Agent Settings", id=1):
693
+ with gr.Group():
694
+ agent_type = gr.Radio(
695
+ ["org", "custom"],
696
+ label="Agent Type",
697
+ value=config['agent_type'],
698
+ info="Select the type of agent to use",
699
+ )
700
+ with gr.Column():
701
+ max_steps = gr.Slider(
702
+ minimum=1,
703
+ maximum=200,
704
+ value=config['max_steps'],
705
+ step=1,
706
+ label="Max Run Steps",
707
+ info="Maximum number of steps the agent will take",
708
+ )
709
+ max_actions_per_step = gr.Slider(
710
+ minimum=1,
711
+ maximum=20,
712
+ value=config['max_actions_per_step'],
713
+ step=1,
714
+ label="Max Actions per Step",
715
+ info="Maximum number of actions the agent will take per step",
716
+ )
717
+ with gr.Column():
718
+ use_vision = gr.Checkbox(
719
+ label="Use Vision",
720
+ value=config['use_vision'],
721
+ info="Enable visual processing capabilities",
722
+ )
723
+ tool_calling_method = gr.Dropdown(
724
+ label="Tool Calling Method",
725
+ value=config['tool_calling_method'],
726
+ interactive=True,
727
+ allow_custom_value=True, # Allow users to input custom model names
728
+ choices=["auto", "json_schema", "function_calling"],
729
+ info="Tool Calls Funtion Name",
730
+ visible=False
731
+ )
732
+
733
+ with gr.TabItem("🔧 LLM Configuration", id=2):
734
+ with gr.Group():
735
+ llm_provider = gr.Dropdown(
736
+ choices=[provider for provider,model in utils.model_names.items()],
737
+ label="LLM Provider",
738
+ value=config['llm_provider'],
739
+ info="Select your preferred language model provider"
740
+ )
741
+ llm_model_name = gr.Dropdown(
742
+ label="Model Name",
743
+ choices=utils.model_names['openai'],
744
+ value=config['llm_model_name'],
745
+ interactive=True,
746
+ allow_custom_value=True, # Allow users to input custom model names
747
+ info="Select a model from the dropdown or type a custom model name"
748
+ )
749
+ llm_num_ctx = gr.Slider(
750
+ minimum=2**8,
751
+ maximum=2**16,
752
+ value=config['llm_num_ctx'],
753
+ step=1,
754
+ label="Max Context Length",
755
+ info="Controls max context length model needs to handle (less = faster)",
756
+ visible=config['llm_provider'] == "ollama"
757
+ )
758
+ llm_temperature = gr.Slider(
759
+ minimum=0.0,
760
+ maximum=2.0,
761
+ value=config['llm_temperature'],
762
+ step=0.1,
763
+ label="Temperature",
764
+ info="Controls randomness in model outputs"
765
+ )
766
+ with gr.Row():
767
+ llm_base_url = gr.Textbox(
768
+ label="Base URL",
769
+ value=config['llm_base_url'],
770
+ info="API endpoint URL (if required)"
771
+ )
772
+ llm_api_key = gr.Textbox(
773
+ label="API Key",
774
+ type="password",
775
+ value=config['llm_api_key'],
776
+ info="Your API key (leave blank to use .env)"
777
+ )
778
+
779
+ # Change event to update context length slider
780
+ def update_llm_num_ctx_visibility(llm_provider):
781
+ return gr.update(visible=llm_provider == "ollama")
782
+
783
+ # Bind the change event of llm_provider to update the visibility of context length slider
784
+ llm_provider.change(
785
+ fn=update_llm_num_ctx_visibility,
786
+ inputs=llm_provider,
787
+ outputs=llm_num_ctx
788
+ )
789
+
790
+ with gr.TabItem("🌐 Browser Settings", id=3):
791
+ with gr.Group():
792
+ with gr.Row():
793
+ use_own_browser = gr.Checkbox(
794
+ label="Use Own Browser",
795
+ value=config['use_own_browser'],
796
+ info="Use your existing browser instance",
797
+ )
798
+ keep_browser_open = gr.Checkbox(
799
+ label="Keep Browser Open",
800
+ value=config['keep_browser_open'],
801
+ info="Keep Browser Open between Tasks",
802
+ )
803
+ headless = gr.Checkbox(
804
+ label="Headless Mode",
805
+ value=config['headless'],
806
+ info="Run browser without GUI",
807
+ )
808
+ disable_security = gr.Checkbox(
809
+ label="Disable Security",
810
+ value=config['disable_security'],
811
+ info="Disable browser security features",
812
+ )
813
+ enable_recording = gr.Checkbox(
814
+ label="Enable Recording",
815
+ value=config['enable_recording'],
816
+ info="Enable saving browser recordings",
817
+ )
818
+
819
+ with gr.Row():
820
+ window_w = gr.Number(
821
+ label="Window Width",
822
+ value=config['window_w'],
823
+ info="Browser window width",
824
+ )
825
+ window_h = gr.Number(
826
+ label="Window Height",
827
+ value=config['window_h'],
828
+ info="Browser window height",
829
+ )
830
+
831
+ save_recording_path = gr.Textbox(
832
+ label="Recording Path",
833
+ placeholder="e.g. ./tmp/record_videos",
834
+ value=config['save_recording_path'],
835
+ info="Path to save browser recordings",
836
+ interactive=True, # Allow editing only if recording is enabled
837
+ )
838
+
839
+ save_trace_path = gr.Textbox(
840
+ label="Trace Path",
841
+ placeholder="e.g. ./tmp/traces",
842
+ value=config['save_trace_path'],
843
+ info="Path to save Agent traces",
844
+ interactive=True,
845
+ )
846
+
847
+ save_agent_history_path = gr.Textbox(
848
+ label="Agent History Save Path",
849
+ placeholder="e.g., ./tmp/agent_history",
850
+ value=config['save_agent_history_path'],
851
+ info="Specify the directory where agent history should be saved.",
852
+ interactive=True,
853
+ )
854
+
855
+ with gr.TabItem("🤖 Run Agent", id=4):
856
+ task = gr.Textbox(
857
+ label="Task Description",
858
+ lines=4,
859
+ placeholder="Enter your task here...",
860
+ value=config['task'],
861
+ info="Describe what you want the agent to do",
862
+ )
863
+ add_infos = gr.Textbox(
864
+ label="Additional Information",
865
+ lines=3,
866
+ placeholder="Add any helpful context or instructions...",
867
+ info="Optional hints to help the LLM complete the task",
868
+ )
869
+
870
+ with gr.Row():
871
+ run_button = gr.Button("▶️ Run Agent", variant="primary", scale=2)
872
+ stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1)
873
+
874
+ with gr.Row():
875
+ browser_view = gr.HTML(
876
+ value="<h1 style='width:80vw; height:50vh'>Waiting for browser session...</h1>",
877
+ label="Live Browser View",
878
+ )
879
+
880
+ with gr.TabItem("🧐 Deep Research", id=5):
881
+ research_task_input = gr.Textbox(label="Research Task", lines=5, value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.")
882
+ with gr.Row():
883
+ max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3, precision=0) # precision=0 确保是整数
884
+ max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1, precision=0) # precision=0 确保是整数
885
+ with gr.Row():
886
+ research_button = gr.Button("▶️ Run Deep Research", variant="primary", scale=2)
887
+ stop_research_button = gr.Button("⏹️ Stop", variant="stop", scale=1)
888
+ markdown_output_display = gr.Markdown(label="Research Report")
889
+ markdown_download = gr.File(label="Download Research Report")
890
+
891
+
892
+ with gr.TabItem("📊 Results", id=6):
893
+ with gr.Group():
894
+
895
+ recording_display = gr.Video(label="Latest Recording")
896
+
897
+ gr.Markdown("### Results")
898
+ with gr.Row():
899
+ with gr.Column():
900
+ final_result_output = gr.Textbox(
901
+ label="Final Result", lines=3, show_label=True
902
+ )
903
+ with gr.Column():
904
+ errors_output = gr.Textbox(
905
+ label="Errors", lines=3, show_label=True
906
+ )
907
+ with gr.Row():
908
+ with gr.Column():
909
+ model_actions_output = gr.Textbox(
910
+ label="Model Actions", lines=3, show_label=True
911
+ )
912
+ with gr.Column():
913
+ model_thoughts_output = gr.Textbox(
914
+ label="Model Thoughts", lines=3, show_label=True
915
+ )
916
+
917
+ trace_file = gr.File(label="Trace File")
918
+
919
+ agent_history_file = gr.File(label="Agent History")
920
+
921
+ # Bind the stop button click event after errors_output is defined
922
+ stop_button.click(
923
+ fn=stop_agent,
924
+ inputs=[],
925
+ outputs=[errors_output, stop_button, run_button],
926
+ )
927
+
928
+ # Run button click handler
929
+ run_button.click(
930
+ fn=run_with_stream,
931
+ inputs=[
932
+ agent_type, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key,
933
+ use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h,
934
+ save_recording_path, save_agent_history_path, save_trace_path, # Include the new path
935
+ enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_calling_method
936
+ ],
937
+ outputs=[
938
+ browser_view, # Browser view
939
+ final_result_output, # Final result
940
+ errors_output, # Errors
941
+ model_actions_output, # Model actions
942
+ model_thoughts_output, # Model thoughts
943
+ recording_display, # Latest recording
944
+ trace_file, # Trace file
945
+ agent_history_file, # Agent history file
946
+ stop_button, # Stop button
947
+ run_button # Run button
948
+ ],
949
+ )
950
+
951
+                 # Run Deep Research
+                 research_button.click(
+                     fn=run_deep_search,
+                     inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision, use_own_browser, headless],
+                     outputs=[markdown_output_display, markdown_download, stop_research_button, research_button]
+                 )
+                 # Bind the research stop button after its output components are defined
+                 stop_research_button.click(
+                     fn=stop_research_agent,
+                     inputs=[],
+                     outputs=[stop_research_button, research_button],
+                 )
+
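+             # Recordings tab: gallery of saved session videos (.mp4 / .webm)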
+ with gr.TabItem("🎥 Recordings", id=7):
965
+ def list_recordings(save_recording_path):
966
+ if not os.path.exists(save_recording_path):
967
+ return []
968
+
969
+ # Get all video files
970
+ recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
971
+
972
+ # Sort recordings by creation time (oldest first)
973
+ recordings.sort(key=os.path.getctime)
974
+
975
+ # Add numbering to the recordings
976
+ numbered_recordings = []
977
+ for idx, recording in enumerate(recordings, start=1):
978
+ filename = os.path.basename(recording)
979
+ numbered_recordings.append((recording, f"{idx}. {filename}"))
980
+
981
+ return numbered_recordings
982
+
983
+                 recordings_gallery = gr.Gallery(
+                     label="Recordings",
+                     value=list_recordings(config['save_recording_path']),
+                     columns=3,
+                     height="auto",
+                     object_fit="contain"
+                 )
+
+                 refresh_button = gr.Button("🔄 Refresh Recordings", variant="secondary")
+                 refresh_button.click(
+                     fn=list_recordings,
+                     inputs=save_recording_path,
+                     outputs=recordings_gallery
+                 )
+
+ with gr.TabItem("📁 Configuration", id=8):
999
+ with gr.Group():
1000
+ config_file_input = gr.File(
1001
+ label="Load Config File",
1002
+ file_types=[".pkl"],
1003
+ interactive=True
1004
+ )
1005
+
1006
+ load_config_button = gr.Button("Load Existing Config From File", variant="primary")
1007
+ save_config_button = gr.Button("Save Current Config", variant="primary")
1008
+
1009
+ config_status = gr.Textbox(
1010
+ label="Status",
1011
+ lines=2,
1012
+ interactive=False
1013
+ )
1014
+
1015
+ load_config_button.click(
1016
+ fn=update_ui_from_config,
1017
+ inputs=[config_file_input],
1018
+ outputs=[
1019
+ agent_type, max_steps, max_actions_per_step, use_vision, tool_calling_method,
1020
+ llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key,
1021
+ use_own_browser, keep_browser_open, headless, disable_security, enable_recording,
1022
+ window_w, window_h, save_recording_path, save_trace_path, save_agent_history_path,
1023
+ task, config_status
1024
+ ]
1025
+ )
1026
+
1027
+ save_config_button.click(
1028
+ fn=save_current_config,
1029
+ inputs=[
1030
+ agent_type, max_steps, max_actions_per_step, use_vision, tool_calling_method,
1031
+ llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key,
1032
+ use_own_browser, keep_browser_open, headless, disable_security,
1033
+ enable_recording, window_w, window_h, save_recording_path, save_trace_path,
1034
+ save_agent_history_path, task,
1035
+ ],
1036
+ outputs=[config_status]
1037
+ )
1038
+
1039
+
1040
+         # Refresh the model dropdown whenever the LLM provider changes
+         llm_provider.change(
+             fn=update_model_dropdown,
+             inputs=[llm_provider, llm_api_key, llm_base_url],
+             outputs=llm_model_name
+         )
+
+         # Enable the recording-path field only while recording is enabled
+         enable_recording.change(
+             lambda enabled: gr.update(interactive=enabled),
+             inputs=enable_recording,
+             outputs=save_recording_path
+         )
+
+         # Changing browser ownership or persistence invalidates any open global browser
+         use_own_browser.change(fn=close_global_browser)
+         keep_browser_open.change(fn=close_global_browser)
+
+     return demo
+
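+ # CLI entry point: parse arguments, build the UI, and launch the Gradio server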
+ def main():
+     parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent")
+     parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
+     parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
+     parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI")
+     parser.add_argument("--dark-mode", action="store_true", help="Enable dark mode")
+     args = parser.parse_args()
+
+     config_dict = default_config()
+
+     demo = create_ui(config_dict, theme_name=args.theme)
+     demo.launch(server_name=args.ip, server_port=args.port, share=True, pwa=True)
+
+ if __name__ == '__main__':
+     main()