Kevin Hu committed on
Commit
71da872
·
1 Parent(s): 44731b3

add component invoke (#2967)

Browse files

### What problem does this PR solve?

#2908

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

agent/component/crawler.py CHANGED
@@ -18,6 +18,7 @@ import asyncio
18
  from crawl4ai import AsyncWebCrawler
19
  from agent.component.base import ComponentBase, ComponentParamBase
20
 
 
21
  class CrawlerParam(ComponentParamBase):
22
  """
23
  Define the Crawler component parameters.
@@ -25,9 +26,11 @@ class CrawlerParam(ComponentParamBase):
25
 
26
  def __init__(self):
27
  super().__init__()
 
 
28
 
29
  def check(self):
30
- return True
31
 
32
 
33
  class Crawler(ComponentBase, ABC):
@@ -46,7 +49,6 @@ class Crawler(ComponentBase, ABC):
46
  except Exception as e:
47
  return Crawler.be_output(f"An unexpected error occurred: {str(e)}")
48
 
49
-
50
  async def get_web(self, url):
51
  proxy = self._param.proxy if self._param.proxy else None
52
  async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
@@ -55,16 +57,13 @@ class Crawler(ComponentBase, ABC):
55
  bypass_cache=True
56
  )
57
 
58
- match self._param.extract_type:
59
- case 'html':
60
- return result.cleaned_html
61
- case 'markdown':
62
- return result.markdown
63
- case 'content':
64
- return result.extracted_content
65
- case _:
66
- return result.markdown
67
- # print(result.markdown)
68
 
69
 
70
 
 
18
  from crawl4ai import AsyncWebCrawler
19
  from agent.component.base import ComponentBase, ComponentParamBase
20
 
21
+
22
  class CrawlerParam(ComponentParamBase):
23
  """
24
  Define the Crawler component parameters.
 
26
 
27
  def __init__(self):
28
  super().__init__()
29
+ self.proxy = None
30
+ self.extract_type = "markdown"
31
 
32
  def check(self):
33
+ self.check_valid_value(self.extract_type, "Type of content from the crawler", ['html', 'markdown', 'content'])
34
 
35
 
36
  class Crawler(ComponentBase, ABC):
 
49
  except Exception as e:
50
  return Crawler.be_output(f"An unexpected error occurred: {str(e)}")
51
 
 
52
  async def get_web(self, url):
53
  proxy = self._param.proxy if self._param.proxy else None
54
  async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
 
57
  bypass_cache=True
58
  )
59
 
60
+ if self._param.extract_type == 'html':
61
+ return result.cleaned_html
62
+ elif self._param.extract_type == 'markdown':
63
+ return result.markdown
64
+ elif self._param.extract_type == 'content':
65
+ result.extracted_content
66
+ return result.markdown
 
 
 
67
 
68
 
69
 
agent/component/invoke.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import json
17
+ from abc import ABC
18
+
19
+ import requests
20
+
21
+ from agent.component.base import ComponentBase, ComponentParamBase
22
+
23
+
24
class InvokeParam(ComponentParamBase):
    """
    Define the Invoke component parameters.
    """

    def __init__(self):
        super().__init__()
        # Proxy URL used for both http and https requests; falsy disables it.
        self.proxy = None
        # Request headers as a JSON-encoded string; empty means no headers.
        self.headers = ""
        # HTTP method, compared case-insensitively: get, post or put.
        self.method = "get"
        # Request arguments: dicts with a "key" plus either a
        # "component_id" (take that component's output) or a "value".
        self.variables = []
        # Endpoint URL; validated as non-empty by check().
        self.url = ""
        # Request timeout in seconds.
        self.timeout = 60

    def check(self):
        """Validate the HTTP method, the endpoint URL and the timeout."""
        self.check_valid_value(self.method.lower(), "HTTP method", ['get', 'post', 'put'])
        self.check_empty(self.url, "End point URL")
        self.check_positive_integer(self.timeout, "Timeout time in second")
42
+
43
+
44
class Invoke(ComponentBase, ABC):
    """Component that calls an HTTP endpoint and outputs the response text."""
    component_name = "Invoke"

    def _run(self, history, **kwargs):
        """
        Send the configured HTTP request and wrap the response body as output.

        Arguments are collected from ``self._param.variables``: an entry with
        a ``component_id`` takes the "content" of that component's output,
        otherwise the entry's literal ``value`` is used. GET sends the
        arguments as query parameters; PUT and POST send them as form data.

        Raises:
            ValueError: if the configured method is not get, post or put.
        """
        args = {}
        for para in self._param.variables:
            if para.get("component_id"):
                cpn = self._canvas.get_component(para["component_id"])["obj"]
                _, out = cpn.output(allow_partial=False)
                # presumably out["content"] iterates over strings -- TODO confirm
                args[para["key"]] = "\n".join(out["content"])
            else:
                # NOTE(review): assumes "value" is a list of strings; a plain
                # string would be split into characters -- confirm with caller.
                args[para["key"]] = "\n".join(para["value"])

        url = self._param.url.strip()
        # Default to plain http when the URL carries no scheme.
        if not url.startswith("http"):
            url = "http://" + url

        method = self._param.method.lower()
        headers = json.loads(self._param.headers) if self._param.headers else {}
        proxies = None
        if self._param.proxy:
            proxies = {"http": self._param.proxy, "https": self._param.proxy}

        common = dict(url=url,
                      headers=headers,
                      proxies=proxies,
                      timeout=self._param.timeout)

        if method == 'get':
            response = requests.get(params=args, **common)
        elif method == 'put':
            response = requests.put(data=args, **common)
        elif method == 'post':
            # Previously unhandled although check() accepts it, which left
            # `response` unbound at the return below.
            response = requests.post(data=args, **common)
        else:
            raise ValueError(f"Unsupported HTTP method: {self._param.method}")

        return Invoke.be_output(response.text)