Arafath10 committed
Commit 4bb67f4 · verified · 1 Parent(s): 53a7825

Update user_guide_sync.py

Files changed (1)
  1. user_guide_sync.py +42 -42
user_guide_sync.py CHANGED
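In short, the update replaces the sequential get_web_data() scraping loop with a per-URL fetch_web_data() helper run in parallel through concurrent.futures.ThreadPoolExecutor, reports fetch failures instead of silently passing on them, and points update_user_guide() at the "data/user_guid" directory when rebuilding the index.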
@@ -15,59 +15,59 @@ from llama_index import StorageContext, load_index_from_storage
 
 
 #os.environ["OPENAI_API_KEY"]
+import concurrent.futures
 
+# URL of the page to scrape
+base_url = 'https://help.storemate.cloud/docs/reports/'
 
 
-# URL of the page to scrape
-url = 'https://help.storemate.cloud/docs/reports/'
-
-
-def get_web_data(valid_links):
-    for url in valid_links:
-        try:
-            # Send a GET request to the URL
-            response = requests.get(url)
-
-            # Parse the page content with BeautifulSoup
-            soup = BeautifulSoup(response.content, 'html.parser')
-
-            # Find the title and section content
-            title = soup.find('h1').get_text()
-
-            # Find the section with the title "Renew Package Subscription"
-            section = soup.find('h1').find_next('div')
-            # Extract the text content from the section
-            section_text = section.get_text().strip()
-            section_text = section_text + f"\nmore detail link : {url}"
-
-            file = open(f"user_guide/{title}.txt","w")
-            file.write(f"{title}\n{section_text}")
-            file.close()
-        except:
-            pass
-    print("data collected")
+def fetch_web_data(url):
+    try:
+        # Send a GET request to the URL
+        response = requests.get(url)
+
+        # Parse the page content with BeautifulSoup
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Find the title and section content
+        title = soup.find('h1').get_text()
+
+        # Find the section with the title "Renew Package Subscription"
+        section = soup.find('h1').find_next('div').find_next('div')
+
+        # Extract the text content from the section
+        section_text = section.get_text().strip()
+        section_text = section_text + f"\nMore detail link: {url}"
+
+        # Save the data into a text file
+        with open(f"user_guide/{title}.txt", "w") as file:
+            file.write(f"{title}\n{section_text}")
+    except Exception as e:
+        print(f"Failed to fetch data from {url}: {e}")
 
 
 def get_base_links():
-    # Send a GET request to the URL
-    response = requests.get(url)
-
+    # Send a GET request to the base URL
+    response = requests.get(base_url)
+
     # Parse the page content with BeautifulSoup
     soup = BeautifulSoup(response.content, 'html.parser')
-
+
     # Find all <a> tags with href attributes
     links = soup.find_all('a', href=True)
-
+
+    # Collect all valid links
     valid_links = []
-    # Extract and print all the URLs
     for link in links:
-        if "https://help.storemate.cloud/docs/" in str(link):
-            valid_links.append(link['href'])
-
-    print("base links collected")
-
-    get_web_data(valid_links)
+        href = link['href']
+        if href.startswith("https://help.storemate.cloud/docs/"):
+            valid_links.append(href)
+
+    print("Base links collected")
 
+    # Use ThreadPoolExecutor to fetch web data in parallel
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        executor.map(fetch_web_data, valid_links)
 
 
 def update_user_guide():
@@ -77,8 +77,8 @@ def update_user_guide():
     # index = load_index_from_storage(storage_context=storage_context)
     # print("loaded")
     # except:
-    documents = SimpleDirectoryReader("user_guide").load_data()
+    documents = SimpleDirectoryReader("data/user_guid").load_data()
     index = VectorStoreIndex.from_documents(documents)
     index.storage_context.persist("llama_index")
     print("index created")
-    return "done"
+    return "done"
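
For readers unfamiliar with the pattern, below is a minimal, self-contained sketch of the ThreadPoolExecutor fan-out that the new fetch_web_data/get_base_links code relies on. It is an illustration only: the fetch_title helper, the timeout value, and the single placeholder URL are assumptions made for this example and do not appear in user_guide_sync.py.

# Minimal sketch of the thread-pool scraping pattern used in the commit.
# Everything here (fetch_title, timeout, the URL list) is illustrative.
import concurrent.futures

import requests
from bs4 import BeautifulSoup


def fetch_title(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        heading = soup.find('h1')
        return url, heading.get_text(strip=True) if heading else None
    except Exception as e:  # mirror the commit's broad error handling
        return url, f"failed: {e}"


if __name__ == "__main__":
    urls = ["https://help.storemate.cloud/docs/reports/"]  # placeholder list
    # executor.map submits one task per URL and yields results in input order
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for url, title in executor.map(fetch_title, urls):
            print(url, "->", title)

In the commit itself, fetch_web_data returns nothing and writes each page to user_guide/<title>.txt, so the results of executor.map are simply not consumed; the sketch returns (url, title) pairs only to show how results could be gathered if needed.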