mehmet0001 committed
Commit 1d91f23 · verified · 1 Parent(s): 4463689

Update crawl_the_site.py

Files changed (1)
crawl_the_site.py +34 -37
crawl_the_site.py CHANGED
@@ -1,38 +1,35 @@
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- import requests as req
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin,urlparse
- import sys
-
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=300,chunk_overlap=100,length_function=len)
-
- def get_base(url):
-     parsed = urlparse(url)
-     base = f"{parsed.scheme}://{parsed.netloc}"
-     return base
-
- def crawl(start,limit):
-     base_domain = get_base(start)
-
-     links = [start]
-     visited = []
-     txt = ""
-
-     while links and len(visited) < limit:
-         to_visit = links.pop(0)
-         visited.append(to_visit)
-
-         html = req.get(to_visit).text
-         soup = BeautifulSoup(html,"lxml")
-         for anchor in soup.find_all("a",href=True):
-             sublink = urljoin(to_visit,anchor["href"])
-             sub_base = get_base(sublink)
-             if not sublink in visited and base_domain == sub_base:
-                 links.append(sublink)
-                 visited.append(to_visit)
-         txt += soup.get_text(" ",True) + "\n\n"
-         sys.stdout.write("\r"+f"Crawling the site %{round((len(visited)/limit)*100)} done.")
-         sys.stdout.flush()
-
-     txt = text_splitter.split_text(txt)
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import requests as req
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin,urlparse
+
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=300,chunk_overlap=100,length_function=len)
+
+ def get_base(url):
+     parsed = urlparse(url)
+     base = f"{parsed.scheme}://{parsed.netloc}"
+     return base
+
+ def crawl(start,limit):
+     base_domain = get_base(start)
+
+     links = [start]
+     visited = []
+     txt = ""
+
+     while links and len(visited) < limit:
+         to_visit = links.pop(0)
+         visited.append(to_visit)
+
+         html = req.get(to_visit).text
+         soup = BeautifulSoup(html,"lxml")
+         for anchor in soup.find_all("a",href=True):
+             sublink = urljoin(to_visit,anchor["href"])
+             sub_base = get_base(sublink)
+             if not sublink in visited and base_domain == sub_base:
+                 links.append(sublink)
+                 visited.append(to_visit)
+         txt += soup.get_text(" ",True) + "\n\n"
+
+     txt = text_splitter.split_text(txt)
      return txt
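
For context, a minimal usage sketch of the crawl function as it stands after this commit (not part of the diff): it assumes the module is importable as crawl_the_site and uses https://example.com only as a placeholder start URL.

# Hypothetical usage sketch, assuming crawl_the_site.py is on the import path.
# crawl(start, limit) fetches same-domain pages breadth-first, concatenates
# their visible text, and returns it split into ~300-character overlapping chunks.
from crawl_the_site import crawl

chunks = crawl("https://example.com", 5)  # visit at most 5 pages
print(f"Got {len(chunks)} text chunks")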