liuhua liuhua committed on
Commit
311da71
·
1 Parent(s): 1635b00

Fix potential SSRF attack vulnerability (#4334)

Browse files

### What problem does this PR solve?

Fix potential SSRF attack vulnerability

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: liuhua <[email protected]>

agent/component/crawler.py CHANGED
@@ -41,7 +41,7 @@ class Crawler(ComponentBase, ABC):
41
  ans = self.get_input()
42
  ans = " - ".join(ans["content"]) if "content" in ans else ""
43
  if not is_valid_url(ans):
44
- return Crawler.be_output("")
45
  try:
46
  result = asyncio.run(self.get_web(ans))
47
 
 
41
  ans = self.get_input()
42
  ans = " - ".join(ans["content"]) if "content" in ans else ""
43
  if not is_valid_url(ans):
44
+ return Crawler.be_output("URL not valid")
45
  try:
46
  result = asyncio.run(self.get_web(ans))
47
 
api/utils/web_utils.py CHANGED
@@ -1,4 +1,7 @@
1
  import re
 
 
 
2
  import json
3
  import base64
4
 
@@ -76,5 +79,25 @@ def __get_pdf_from_html(
76
  return base64.b64decode(result["data"])
77
 
78
 
 
 
 
 
 
 
 
79
  def is_valid_url(url: str) -> bool:
80
- return bool(re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
+ import socket
3
+ from urllib.parse import urlparse
4
+ import ipaddress
5
  import json
6
  import base64
7
 
 
79
  return base64.b64decode(result["data"])
80
 
81
 
82
def is_private_ip(ip: str) -> bool:
    """Return True when *ip* is an address a server-side fetch must not reach.

    Besides the ranges covered by ``is_private`` this also rejects loopback,
    link-local, multicast, reserved and unspecified addresses, and anything
    that is not globally routable (for example the 100.64.0.0/10 shared
    address space, which ``is_private`` alone does not flag).

    Args:
        ip: Textual IPv4 or IPv6 address. A trailing IPv6 zone id
            (``%eth0``) is ignored.

    Returns:
        True if the address is non-public; False if it is public or if *ip*
        is not a parseable IP address (kept for backward compatibility).
    """
    try:
        # Strip a scoped-IPv6 zone id so the literal parses on every Python
        # version instead of silently falling through to "not private".
        ip_obj = ipaddress.ip_address(ip.split("%", 1)[0])
    except ValueError:
        return False
    return (
        ip_obj.is_private
        or ip_obj.is_loopback
        or ip_obj.is_link_local
        or ip_obj.is_multicast
        or ip_obj.is_reserved
        or ip_obj.is_unspecified
        or not ip_obj.is_global
    )


def is_valid_url(url: str) -> bool:
    """Return True when *url* is an http(s) URL whose host resolves publicly.

    The check is an SSRF guard: the URL must start with a well-formed
    http/https prefix, have a hostname, and every address that hostname
    resolves to (IPv4 and IPv6) must be public according to
    ``is_private_ip``.

    NOTE: this validates the DNS answer at call time only. The HTTP client
    resolves the name again when fetching, so DNS rebinding is still possible
    unless the fetch is pinned to the validated address.

    Args:
        url: Candidate URL string.

    Returns:
        True if the URL passes all checks, False otherwise. Never raises for
        malformed input.
    """
    if not re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url):
        return False

    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # urlparse rejects e.g. unbalanced IPv6 brackets ("http://ab[cd"),
        # which the prefix-only regex above lets through.
        return False
    if not hostname:
        return False

    try:
        # getaddrinfo returns every A/AAAA record; gethostbyname would only
        # expose a single IPv4 address and hide the others from the check.
        addr_infos = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP)
    except (socket.gaierror, UnicodeError):
        # UnicodeError: hostname cannot be IDNA-encoded (label too long, ...).
        return False

    addresses = {info[4][0] for info in addr_infos}
    if not addresses:
        return False
    # Reject the URL if ANY resolved address is non-public.
    return not any(is_private_ip(address) for address in addresses)