quoc-khanh committed
Commit d95c7be (verified) · Parent: 62319f5

Update file_loader.py

Files changed (1):
  file_loader.py +3 -82
file_loader.py CHANGED
@@ -9,91 +9,12 @@ from helpers import (
     list_docx_files, # Get the list of .docx files
     get_splits, # Process .docx files into splits
     get_json_splits_only, # Process JSON (FAQ) files
-    # scrape_website, # Process data from the web
+    get_web_documents, # Process data from the web
 )

-
 import json

-os.system("playwright install chromium")
-
-import asyncio
-from urllib.parse import urljoin
-from playwright.async_api import async_playwright
-from langchain_community.document_loaders import AsyncHtmlLoader
-from langchain_community.document_transformers import Html2TextTransformer
-from tqdm.asyncio import tqdm
-
-SCRAPED_DATA_PATH = "scraped_data.json"
-
-# ----------- ASYNC SCRAPING FUNCTIONS -----------
-async def _fetch_urls(base_url):
-    """Extract all links from a JavaScript-rendered webpage."""
-    urls = set()
-    try:
-        async with async_playwright() as p:
-            browser = await p.chromium.launch(headless=True)
-            page = await browser.new_page()
-            await page.goto(base_url)
-            await page.wait_for_load_state("networkidle")
-            links = await page.locator("a").all()
-            for link in links:
-                href = await link.get_attribute("href")
-                if href and "#" not in href:
-                    full_url = urljoin(base_url, href)
-                    if full_url.startswith(base_url):
-                        urls.add(full_url)
-            await browser.close()
-    except Exception as e:
-        print(f"⚠️ Cannot access {base_url}: {e}")
-    return list(urls)
-
-async def _fetch_web_content(urls):
-    """Fetch HTML content and convert it to text, with a progress bar."""
-    docs = []
-    progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
-    for page_url in urls:
-        try:
-            # Replace these with your actual async loader/transformer.
-            loader = AsyncHtmlLoader(page_url)
-            html2text = Html2TextTransformer()
-            html = await loader.aload()
-            doc = html2text.transform_documents(html)
-            docs.extend(doc)
-        except Exception as e:
-            print(f"Error loading {page_url}: {e}")
-        progress_bar.update(1)
-    progress_bar.close()
-    return docs
-
-async def scrape_website(base_urls):
-    """
-    Scrapes a list of base URLs and extracts their content.
-    """
-    all_urls = []
-    for base_url in base_urls:
-        urls = await _fetch_urls(base_url)
-        all_urls.extend(urls)
-    docs = await _fetch_web_content(all_urls)
-    return docs
-
-async def get_scraped_data(base_urls):
-    """
-    Automatically load scraped data from file if available;
-    otherwise, scrape and cache it.
-    """
-    if os.path.exists(SCRAPED_DATA_PATH):
-        print("🔄 Loading scraped website contents from file...")
-        with open(SCRAPED_DATA_PATH, "r", encoding="utf-8") as f:
-            return json.load(f)
-
-    print("🌍 Scraping websites...")
-    website_contents = await scrape_website(base_urls)
-    with open(SCRAPED_DATA_PATH, "w", encoding="utf-8") as f:
-        json.dump(website_contents, f, ensure_ascii=False, indent=4)
-    return website_contents
-
-async def get_vectorstore():
+def get_vectorstore():
     ### Process all documents and load them into the database
     folder_path = "syllabus_nct_word_format/"
     docx_files = list_docx_files(folder_path)
@@ -103,7 +24,7 @@ async def get_vectorstore():
     #
     base_urls = ['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
     # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
-    website_contents = await get_scraped_data(base_urls=base_urls)
+    website_contents = get_web_documents(base_urls=base_urls)
     all_splits += website_contents

     print('Feeding .docx files')
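The new code path relies on a get_web_documents helper in helpers.py that this commit does not show. A minimal sketch of what such a helper could look like, assuming it keeps the removed code's scraped_data.json cache and fetches only the listed URLs through AsyncHtmlLoader (no Playwright link crawl, so JavaScript-rendered pages are not covered); everything beyond the function name is an assumption:

# Hypothetical sketch of helpers.get_web_documents -- not part of this commit.
import json
import os

from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

SCRAPED_DATA_PATH = "scraped_data.json"  # same cache file the removed code used

def get_web_documents(base_urls):
    """Fetch each URL, convert its HTML to plain text, and cache the result."""
    if os.path.exists(SCRAPED_DATA_PATH):
        print("🔄 Loading scraped website contents from file...")
        with open(SCRAPED_DATA_PATH, "r", encoding="utf-8") as f:
            return json.load(f)

    print("🌍 Scraping websites...")
    # AsyncHtmlLoader.load() drives its own event loop internally, which is
    # what lets the new get_vectorstore() drop the async/await plumbing.
    html_docs = AsyncHtmlLoader(base_urls).load()
    text_docs = Html2TextTransformer().transform_documents(html_docs)
    contents = [doc.page_content for doc in text_docs]

    with open(SCRAPED_DATA_PATH, "w", encoding="utf-8") as f:
        json.dump(contents, f, ensure_ascii=False, indent=4)
    return contents

Caching plain page_content strings would also sidestep a latent bug in the removed get_scraped_data, which passed LangChain Document objects to json.dump (they are not JSON-serializable), and it keeps `all_splits += website_contents` working identically on the cached and freshly scraped paths.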