Arafath10 committed
Commit 4bb67f4 · verified · 1 Parent(s): 53a7825

Update user_guide_sync.py

Files changed (1)
  1. user_guide_sync.py +42 -42
user_guide_sync.py CHANGED
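In short, the update replaces the sequential get_web_data() scraping loop with a per-URL fetch_web_data() helper run in parallel through concurrent.futures.ThreadPoolExecutor, reports fetch failures instead of silently passing on them, and points update_user_guide() at the "data/user_guid" directory when rebuilding the index.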
@@ -15,59 +15,59 @@ from llama_index import StorageContext, load_index_from_storage
 
 
 #os.environ["OPENAI_API_KEY"]
+import concurrent.futures
 
+# URL of the page to scrape
+base_url = 'https://help.storemate.cloud/docs/reports/'
 
 
-# URL of the page to scrape
-url = 'https://help.storemate.cloud/docs/reports/'
-
-
-def get_web_data(valid_links):
-    for url in valid_links:
-        try:
-            # Send a GET request to the URL
-            response = requests.get(url)
-
-            # Parse the page content with BeautifulSoup
-            soup = BeautifulSoup(response.content, 'html.parser')
-
-            # Find the title and section content
-            title = soup.find('h1').get_text()
-
-            # Find the section with the title "Renew Package Subscription"
-            section = soup.find('h1').find_next('div')
-            # Extract the text content from the section
-            section_text = section.get_text().strip()
-            section_text = section_text + f"\nmore detail link : {url}"
-
-            file = open(f"user_guide/{title}.txt","w")
-            file.write(f"{title}\n{section_text}")
-            file.close()
-        except:
-            pass
-    print("data collected")
+def fetch_web_data(url):
+    try:
+        # Send a GET request to the URL
+        response = requests.get(url)
+
+        # Parse the page content with BeautifulSoup
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Find the title and section content
+        title = soup.find('h1').get_text()
+
+        # Find the section with the title "Renew Package Subscription"
+        section = soup.find('h1').find_next('div').find_next('div')
+
+        # Extract the text content from the section
+        section_text = section.get_text().strip()
+        section_text = section_text + f"\nMore detail link: {url}"
+
+        # Save the data into a text file
+        with open(f"user_guide/{title}.txt", "w") as file:
+            file.write(f"{title}\n{section_text}")
+    except Exception as e:
+        print(f"Failed to fetch data from {url}: {e}")
 
 
 def get_base_links():
-    # Send a GET request to the URL
-    response = requests.get(url)
-
+    # Send a GET request to the base URL
+    response = requests.get(base_url)
+
     # Parse the page content with BeautifulSoup
     soup = BeautifulSoup(response.content, 'html.parser')
-
+
     # Find all <a> tags with href attributes
     links = soup.find_all('a', href=True)
-
+
+    # Collect all valid links
     valid_links = []
-    # Extract and print all the URLs
     for link in links:
-        if "https://help.storemate.cloud/docs/" in str(link):
-            valid_links.append(link['href'])
-
-    print("base links collected")
-
-    get_web_data(valid_links)
+        href = link['href']
+        if href.startswith("https://help.storemate.cloud/docs/"):
+            valid_links.append(href)
+
+    print("Base links collected")
 
+    # Use ThreadPoolExecutor to fetch web data in parallel
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        executor.map(fetch_web_data, valid_links)
 
 
 def update_user_guide():
@@ -77,8 +77,8 @@ def update_user_guide():
     # index = load_index_from_storage(storage_context=storage_context)
     # print("loaded")
     # except:
-    documents = SimpleDirectoryReader("user_guide").load_data()
+    documents = SimpleDirectoryReader("data/user_guid").load_data()
     index = VectorStoreIndex.from_documents(documents)
     index.storage_context.persist("llama_index")
     print("index created")
-    return "done"
+    return "done"
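
For readers unfamiliar with the pattern, below is a minimal, self-contained sketch of the ThreadPoolExecutor fan-out that the new fetch_web_data/get_base_links code relies on. It is an illustration only: the fetch_title helper, the timeout value, and the single placeholder URL are assumptions made for this example and do not appear in user_guide_sync.py.

# Minimal sketch of the thread-pool scraping pattern used in the commit.
# Everything here (fetch_title, timeout, the URL list) is illustrative.
import concurrent.futures

import requests
from bs4 import BeautifulSoup


def fetch_title(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        heading = soup.find('h1')
        return url, heading.get_text(strip=True) if heading else None
    except Exception as e:  # mirror the commit's broad error handling
        return url, f"failed: {e}"


if __name__ == "__main__":
    urls = ["https://help.storemate.cloud/docs/reports/"]  # placeholder list
    # executor.map submits one task per URL and yields results in input order
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for url, title in executor.map(fetch_title, urls):
            print(url, "->", title)

In the commit itself, fetch_web_data returns nothing and writes each page to user_guide/<title>.txt, so the results of executor.map are simply not consumed; the sketch returns (url, title) pairs only to show how results could be gathered if needed.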