Update file_loader.py
file_loader.py CHANGED (+3 -82)
@@ -9,91 +9,12 @@ from helpers import (
     list_docx_files,       # Get the list of .docx files
     get_splits,            # Process .docx files into splits
     get_json_splits_only,  # Process the JSON (FAQ) files
-
+    get_web_documents,     # Process data from the web
 )
 
-
 import json
 
-
-
-import asyncio
-from urllib.parse import urljoin
-from playwright.async_api import async_playwright
-from langchain_community.document_loaders import AsyncHtmlLoader
-from langchain_community.document_transformers import Html2TextTransformer
-from tqdm.asyncio import tqdm
-
-SCRAPED_DATA_PATH = "scraped_data.json"
-
-# ----------- ASYNC SCRAPING FUNCTIONS -----------
-async def _fetch_urls(base_url):
-    """Extract all links from a JavaScript-rendered webpage."""
-    urls = set()
-    try:
-        async with async_playwright() as p:
-            browser = await p.chromium.launch(headless=True)
-            page = await browser.new_page()
-            await page.goto(base_url)
-            await page.wait_for_load_state("networkidle")
-            links = await page.locator("a").all()
-            for link in links:
-                href = await link.get_attribute("href")
-                if href and "#" not in href:
-                    full_url = urljoin(base_url, href)
-                    if full_url.startswith(base_url):
-                        urls.add(full_url)
-            await browser.close()
-    except Exception as e:
-        print(f"⚠️ Không thể truy cập {base_url}: {e}")
-    return list(urls)
-
-async def _fetch_web_content(urls):
-    """Fetch HTML content and convert it to text, with a progress bar."""
-    docs = []
-    progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
-    for page_url in urls:
-        try:
-            # Replace these with your actual async loader/transformer.
-            loader = AsyncHtmlLoader(page_url)
-            html2text = Html2TextTransformer()
-            html = await loader.aload()
-            doc = html2text.transform_documents(html)
-            docs.extend(doc)
-        except Exception as e:
-            print(f"Error loading {page_url}: {e}")
-        progress_bar.update(1)
-    progress_bar.close()
-    return docs
-
-async def scrape_website(base_urls):
-    """
-    Scrapes a list of base URLs and extracts their content.
-    """
-    all_urls = []
-    for base_url in base_urls:
-        urls = await _fetch_urls(base_url)
-        all_urls.extend(urls)
-    docs = await _fetch_web_content(all_urls)
-    return docs
-
-async def get_scraped_data(base_urls):
-    """
-    Automatically load scraped data from file if available;
-    otherwise, scrape and cache it.
-    """
-    if os.path.exists(SCRAPED_DATA_PATH):
-        print("🔄 Loading scraped website contents from file...")
-        with open(SCRAPED_DATA_PATH, "r", encoding="utf-8") as f:
-            return json.load(f)
-
-    print("🌍 Scraping websites...")
-    website_contents = await scrape_website(base_urls)
-    with open(SCRAPED_DATA_PATH, "w", encoding="utf-8") as f:
-        json.dump(website_contents, f, ensure_ascii=False, indent=4)
-    return website_contents
-
-async def get_vectorstore():
+def get_vectorstore():
     ### Process all documents and feed them into the database
     folder_path = "syllabus_nct_word_format/"
     docx_files = list_docx_files(folder_path)
@@ -103,7 +24,7 @@ async def get_vectorstore():
     #
     base_urls =['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
     # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
-    website_contents =
+    website_contents = get_web_documents(base_urls=base_urls)
     all_splits += website_contents
 
     print('Feeding .docx files')
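The body of get_web_documents is not shown in this commit; only its import from helpers and the call site appear in the diff. The snippet below is a hypothetical sketch of what such a helper might look like: a stripped-down version of the scraping pipeline removed from file_loader.py, wrapped behind a synchronous function so get_vectorstore() no longer needs to be async. The function name, the base_urls keyword, and SCRAPED_DATA_PATH come from the diff; everything else (the _scrape stand-in, fetching only the base URLs instead of crawling sub-links with Playwright, and caching plain page text) is an assumption.

# Hypothetical sketch only -- the real helpers.get_web_documents is not part of this diff.
import asyncio
import json
import os

from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

SCRAPED_DATA_PATH = "scraped_data.json"  # same cache file the removed code used

async def _scrape(base_urls):
    # Stand-in for the removed scrape_website(): fetch the base URLs directly
    # instead of discovering sub-links with Playwright first.
    loader = AsyncHtmlLoader(base_urls)
    html_docs = await loader.aload()
    return Html2TextTransformer().transform_documents(html_docs)

def get_web_documents(base_urls):
    """Return cached page text if present; otherwise scrape and cache it."""
    if os.path.exists(SCRAPED_DATA_PATH):
        with open(SCRAPED_DATA_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    docs = asyncio.run(_scrape(base_urls))
    contents = [doc.page_content for doc in docs]  # assumption: cache plain text, not Document objects
    with open(SCRAPED_DATA_PATH, "w", encoding="utf-8") as f:
        json.dump(contents, f, ensure_ascii=False, indent=4)
    return contents

Keeping the event loop inside the helper is what lets this commit drop async from get_vectorstore(); the trade-off is that file_loader.py can no longer await the scrape alongside other work.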