NihalGazi committed on
Commit
58104b7
·
verified ·
1 Parent(s): 1fc0179

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -144
app.py CHANGED
@@ -1,144 +1,146 @@
1
- import os
2
- import time
3
- import random
4
- import subprocess
5
-
6
- import gradio as gr
7
- from playwright.sync_api import Error as PlaywrightError
8
- from playwright_extra import PlaywrightExtra
9
- from playwright_extra.plugins.stealth import stealth_plugin
10
- from playwright_sync_stealth import stealth_sync
11
-
12
- # ——— ROTATING USER-AGENTS ———
13
- UA_POOL = [
14
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
15
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
16
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
17
- # add more as needed
18
- ]
19
-
20
- # ——— SETUP Playwright-Extra + Stealth ———
21
- pw = PlaywrightExtra()
22
- pw.use(stealth_plugin())
23
-
24
- def install_playwright_browser_if_needed():
25
- """Ensures Chromium is installed for Playwright."""
26
- try:
27
- with pw.chromium.launch(headless=True) as browser:
28
- return True
29
- except PlaywrightError:
30
- try:
31
- subprocess.run(
32
- ["playwright", "install", "chromium"],
33
- check=True, capture_output=True, text=True
34
- )
35
- with pw.chromium.launch(headless=True) as browser:
36
- return True
37
- except Exception:
38
- return False
39
-
40
- # INITIALIZE
41
- print("Checking Playwright Chromium…")
42
- BROWSER_READY = install_playwright_browser_if_needed()
43
- print("Browser ready:", BROWSER_READY)
44
-
45
- def take_web_screenshot(url: str, full_page: bool, use_proxy: bool):
46
- if not BROWSER_READY:
47
- return None, "Error: Playwright browser not installed."
48
-
49
- if not url.strip():
50
- return None, "Please provide a valid URL."
51
-
52
- if not (url.startswith("http://") or url.startswith("https://")):
53
- url = "https://" + url.strip()
54
-
55
- # Launch headless (you can set headless=False for debugging)
56
- with pw.chromium.launch(
57
- headless=True,
58
- args=[
59
- "--disable-blink-features=AutomationControlled",
60
- "--no-sandbox",
61
- "--disable-infobars",
62
- ]
63
- ) as browser:
64
-
65
- # Build context with random UA, locale, optional proxy
66
- proxy_cfg = {"server": "http://USER:[email protected]:8000"} if use_proxy else None
67
- context = browser.new_context(
68
- user_agent=random.choice(UA_POOL),
69
- locale="en-US",
70
- timezone_id="Asia/Kolkata",
71
- extra_http_headers={
72
- "accept-language": "en-US,en;q=0.9",
73
- "accept-encoding": "gzip, deflate, br",
74
- },
75
- proxy=proxy_cfg
76
- )
77
-
78
- page = context.new_page()
79
-
80
- # Stealth injection
81
- stealth_sync(page)
82
- page.add_init_script(
83
- "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
84
- )
85
-
86
- try:
87
- # Navigate & emulate human delay
88
- page.goto(url, timeout=60000, wait_until="load")
89
- page.wait_for_timeout(random.randint(1000, 3000))
90
- # Optional simple mouse movement
91
- page.mouse.move(100, 100, steps=10)
92
-
93
- # Build screenshot path
94
- os.makedirs("screenshots", exist_ok=True)
95
- timestamp = time.strftime("%Y%m%d-%H%M%S")
96
- safe_name = "".join(c if c.isalnum() else "_" for c in url.split("://")[-1])[:50]
97
- path = f"screenshots/{safe_name}_{timestamp}.png"
98
-
99
- # Capture
100
- page.screenshot(path=path, full_page=full_page)
101
- context.close()
102
- return path, f"βœ… Screenshot captured: {url}"
103
-
104
- except PlaywrightError as e:
105
- msg = str(e)
106
- if "net::ERR_NAME_NOT_RESOLVED" in msg:
107
- err = "Domain not found."
108
- elif "Timeout" in msg:
109
- err = "Page load timeout."
110
- elif "SSL" in msg.lower():
111
- err = "SSL error."
112
- else:
113
- err = "Playwright error."
114
- return None, f"❌ {err} Details: {msg[:200]}"
115
-
116
- except Exception as e:
117
- return None, f"❌ Unexpected error: {str(e)[:200]}"
118
-
119
- # ——— GRADIO INTERFACE ———
120
- iface = gr.Interface(
121
- fn=take_web_screenshot,
122
- inputs=[
123
- gr.Textbox(label="Website URL", placeholder="e.g. example.com"),
124
- gr.Checkbox(label="Full-page screenshot?", value=False),
125
- gr.Checkbox(label="Use proxy?", value=False),
126
- ],
127
- outputs=[
128
- gr.Image(type="filepath", label="Screenshot"),
129
- gr.Textbox(label="Status"),
130
- ],
131
- title="Stealth Screenshot Taker πŸ“Έ",
132
- description=(
133
- "Captures screenshots while evading bot-detection. "
134
- "Enables rotating UAs, stealth plugin, human-like delays, and optional proxy."
135
- ),
136
- examples=[
137
- ["gradio.app", False, False],
138
- ["wikipedia.org", True, False],
139
- ],
140
- allow_flagging="never",
141
- )
142
-
143
- if __name__ == "__main__":
144
- iface.launch()
 
 
 
1
+ import gradio as gr
2
+ from playwright.sync_api import sync_playwright, Error as PlaywrightError
3
+ import subprocess
4
+ import os
5
+ import time
6
+
7
+ # --- Helper to ensure Playwright browser is installed ---
8
+ # This function attempts to install the browser if not found.
9
+ # It's best-effort for standard Gradio spaces; a Dockerfile is more robust.
10
def install_playwright_browser_if_needed():
    """Check that Playwright can launch headless Chromium, installing it if not.

    A headless Chromium launch is attempted first. If that launch raises a
    Playwright error, ``playwright install chromium`` is run as a subprocess
    and the launch is attempted once more. Progress and failures are printed.

    Returns:
        bool: True if a headless Chromium launch succeeded (either right away
        or after installation), False if installation or verification failed.
    """
    install_command = ["playwright", "install", "chromium"]
    try:
        # First attempt: can Playwright already start Chromium?
        with sync_playwright() as playwright:
            try:
                playwright.chromium.launch(headless=True).close()
            except PlaywrightError:
                print("Playwright Chromium browser not found or not executable. Attempting installation.")
            else:
                print("Playwright Chromium browser is available.")
                return True

        # The launch failed, so download the browser into Playwright's
        # managed location and then verify with a second launch.
        print("Attempting to install Playwright Chromium browser executable...")
        try:
            subprocess.run(install_command, check=True, capture_output=True, text=True)
            print("Playwright Chromium executable installed successfully.")
            with sync_playwright() as playwright:
                playwright.chromium.launch(headless=True).close()
                print("Playwright Chromium successfully verified after installation.")
                return True
        except subprocess.CalledProcessError as install_error:
            # check=True turns a non-zero exit status into this exception.
            print(f"Playwright install chromium failed. STDERR: {install_error.stderr} STDOUT: {install_error.stdout}")
            return False
        except FileNotFoundError:
            print("Playwright command not found. Ensure 'playwright' is in requirements.txt and installed.")
            return False
    except Exception as setup_error:
        # Best-effort setup: anything unexpected is reported, never raised.
        print(f"An error occurred during Playwright browser setup: {setup_error}")
        return False
45
+
46
+ # Run browser installation check when the app starts.
47
+ # Logs will appear in the Hugging Face Space "Logs" tab.
48
# Probe (and, if needed, install) Chromium once when the module is loaded;
# the outcome is stored in BROWSER_READY and printed.
print("Initializing Space: Checking/Installing Playwright browser...")
BROWSER_READY = install_playwright_browser_if_needed()
print(
    "Browser is ready."
    if BROWSER_READY
    else "WARNING: Browser installation failed or could not be verified. Screenshot functionality may not work."
)
54
+
55
+ # --- Screenshot function ---
56
def _describe_playwright_error(url, detail):
    """Turn a Playwright error message into the status text shown to the user."""
    lowered = detail.lower()
    if "net::ERR_NAME_NOT_RESOLVED" in detail:
        return f"Error: The URL '{url}' could not be resolved. Please check the domain name."
    # Playwright's own timeout says "Timeout ...", while Chromium reports
    # network timeouts as net::ERR_TIMED_OUT / net::ERR_CONNECTION_TIMED_OUT.
    if "Timeout" in detail or "timed_out" in lowered:
        return f"Error: Timeout while loading '{url}'. The page might be too slow, offline, or protected."
    # Chromium reports TLS problems as net::ERR_SSL_* or net::ERR_CERT_*;
    # the plain "ssl_error" spelling is still recognised as before.
    if any(marker in lowered for marker in ("err_ssl", "err_cert", "ssl_error")):
        return f"Error: SSL certificate issue with '{url}'. The site might be insecure or misconfigured."
    return f"Error: Could not take screenshot of '{url}'. Details: {detail[:200]}..."


def take_web_screenshot(url: str):
    """Capture a 1280x720 viewport screenshot of a web page with headless Chromium.

    Args:
        url: Address entered by the user. Surrounding whitespace is ignored
            and ``https://`` is prepended when no http(s) scheme is present.

    Returns:
        A ``(path, status)`` tuple. On success ``path`` is the PNG written
        under ``screenshots/`` and ``status`` is a confirmation message; on
        any failure ``path`` is None and ``status`` describes the problem.
    """
    if not BROWSER_READY:
        return None, "Error: Playwright browser (Chromium) is not properly installed or configured. Cannot take screenshot."

    # Strip before validating so whitespace-only input is rejected and a
    # padded URL does not turn into "https:// example.com".
    url = (url or "").strip()
    if not url:
        return None, "Please enter a website URL."

    # Prepend https:// if no scheme is present; the comparison is
    # case-insensitive so "HTTPS://example.com" is left alone.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
                )
                page = context.new_page()

                # Set a common viewport size
                page.set_viewport_size({"width": 1280, "height": 720})

                print(f"Navigating to URL: {url}")
                page.goto(url, timeout=60000, wait_until="domcontentloaded")  # 60 seconds timeout

                # Build a filename from the sanitized URL plus a timestamp.
                os.makedirs("screenshots", exist_ok=True)
                timestamp = time.strftime("%Y%m%d-%H%M%S")
                sanitized_url_part = "".join(
                    c if c.isalnum() else "_" for c in url.split("://")[-1]
                )[:50]
                screenshot_path = f"screenshots/screenshot_{sanitized_url_part}_{timestamp}.png"

                print(f"Taking screenshot and saving to: {screenshot_path}")
                page.screenshot(path=screenshot_path, full_page=False)  # Captures the viewport
            finally:
                # Close the browser on failure as well as on success.
                browser.close()

            print("Screenshot successful.")
            return screenshot_path, f"Screenshot of {url} captured successfully!"

    except PlaywrightError as e:
        error_message_detail = str(e)
        print(f"Playwright error: {error_message_detail}")
        # Never hand back a file path on error.
        return None, _describe_playwright_error(url, error_message_detail)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None, f"An unexpected error occurred: {str(e)[:200]}..."
117
+
118
+ # --- Gradio Interface ---
119
# Gradio UI: a single URL textbox as input; the screenshot image and a
# status message as outputs.
iface = gr.Interface(
    fn=take_web_screenshot,
    inputs=gr.Textbox(
        label="Website URL",
        placeholder="e.g., https://www.example.com or example.com"
    ),
    outputs=[
        gr.Image(type="filepath", label="Website Screenshot"),
        gr.Textbox(label="Status")
    ],
    title="Website Screenshot Taker 📸",
    description=(
        "Enter a full website URL to capture a screenshot of its current view. "
        "Prefix with http:// or https:// for best results."
    ),
    examples=[
        ["https://gradio.app"],
        ["huggingface.co/spaces"],
        ["en.wikipedia.org/wiki/Python_(programming_language)"]
    ],
    allow_flagging="never",
    css=".gradio-container {max-width: 960px !important; margin: auto !important;}"  # Optional: center and limit width
)

# --- Main launch ---
if __name__ == "__main__":
    print("Starting Gradio application...")
    iface.launch()