Update file_loader.py
file_loader.py CHANGED (+3 -82)
@@ -9,91 +9,12 @@ from helpers import (
     list_docx_files,       # Get the list of .docx files
     get_splits,            # Process .docx files into splits
     get_json_splits_only,  # Process the JSON (FAQ) files
-
+    get_web_documents,     # Process data from the web
 )
 
-
 import json
 
-
-
-import asyncio
-from urllib.parse import urljoin
-from playwright.async_api import async_playwright
-from langchain_community.document_loaders import AsyncHtmlLoader
-from langchain_community.document_transformers import Html2TextTransformer
-from tqdm.asyncio import tqdm
-
-SCRAPED_DATA_PATH = "scraped_data.json"
-
-# ----------- ASYNC SCRAPING FUNCTIONS -----------
-async def _fetch_urls(base_url):
-    """Extract all links from a JavaScript-rendered webpage."""
-    urls = set()
-    try:
-        async with async_playwright() as p:
-            browser = await p.chromium.launch(headless=True)
-            page = await browser.new_page()
-            await page.goto(base_url)
-            await page.wait_for_load_state("networkidle")
-            links = await page.locator("a").all()
-            for link in links:
-                href = await link.get_attribute("href")
-                if href and "#" not in href:
-                    full_url = urljoin(base_url, href)
-                    if full_url.startswith(base_url):
-                        urls.add(full_url)
-            await browser.close()
-    except Exception as e:
-        print(f"⚠️ Không thể truy cập {base_url}: {e}")
-    return list(urls)
-
-async def _fetch_web_content(urls):
-    """Fetch HTML content and convert it to text, with a progress bar."""
-    docs = []
-    progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
-    for page_url in urls:
-        try:
-            # Replace these with your actual async loader/transformer.
-            loader = AsyncHtmlLoader(page_url)
-            html2text = Html2TextTransformer()
-            html = await loader.aload()
-            doc = html2text.transform_documents(html)
-            docs.extend(doc)
-        except Exception as e:
-            print(f"Error loading {page_url}: {e}")
-        progress_bar.update(1)
-    progress_bar.close()
-    return docs
-
-async def scrape_website(base_urls):
-    """
-    Scrapes a list of base URLs and extracts their content.
-    """
-    all_urls = []
-    for base_url in base_urls:
-        urls = await _fetch_urls(base_url)
-        all_urls.extend(urls)
-    docs = await _fetch_web_content(all_urls)
-    return docs
-
-async def get_scraped_data(base_urls):
-    """
-    Automatically load scraped data from file if available;
-    otherwise, scrape and cache it.
-    """
-    if os.path.exists(SCRAPED_DATA_PATH):
-        print("🔄 Loading scraped website contents from file...")
-        with open(SCRAPED_DATA_PATH, "r", encoding="utf-8") as f:
-            return json.load(f)
-
-    print("🌍 Scraping websites...")
-    website_contents = await scrape_website(base_urls)
-    with open(SCRAPED_DATA_PATH, "w", encoding="utf-8") as f:
-        json.dump(website_contents, f, ensure_ascii=False, indent=4)
-    return website_contents
-
-async def get_vectorstore():
+def get_vectorstore():
     ### Process all documents and feed them into the database
     folder_path = "syllabus_nct_word_format/"
     docx_files = list_docx_files(folder_path)
@@ -103,7 +24,7 @@ async def get_vectorstore():
     #
     base_urls =['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
     # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
-    website_contents =
+    website_contents = get_web_documents(base_urls=base_urls)
     all_splits += website_contents
 
     print('Feeding .docx files')
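The body of get_web_documents is not shown in this commit; only its import from helpers and the call site appear in the diff. The snippet below is a hypothetical sketch of what such a helper might look like: a stripped-down version of the scraping pipeline removed from file_loader.py, wrapped behind a synchronous function so get_vectorstore() no longer needs to be async. The function name, the base_urls keyword, and SCRAPED_DATA_PATH come from the diff; everything else (the _scrape stand-in, fetching only the base URLs instead of crawling sub-links with Playwright, and caching plain page text) is an assumption.

# Hypothetical sketch only -- the real helpers.get_web_documents is not part of this diff.
import asyncio
import json
import os

from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

SCRAPED_DATA_PATH = "scraped_data.json"  # same cache file the removed code used

async def _scrape(base_urls):
    # Stand-in for the removed scrape_website(): fetch the base URLs directly
    # instead of discovering sub-links with Playwright first.
    loader = AsyncHtmlLoader(base_urls)
    html_docs = await loader.aload()
    return Html2TextTransformer().transform_documents(html_docs)

def get_web_documents(base_urls):
    """Return cached page text if present; otherwise scrape and cache it."""
    if os.path.exists(SCRAPED_DATA_PATH):
        with open(SCRAPED_DATA_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    docs = asyncio.run(_scrape(base_urls))
    contents = [doc.page_content for doc in docs]  # assumption: cache plain text, not Document objects
    with open(SCRAPED_DATA_PATH, "w", encoding="utf-8") as f:
        json.dump(contents, f, ensure_ascii=False, indent=4)
    return contents

Keeping the event loop inside the helper is what lets this commit drop async from get_vectorstore(); the trade-off is that file_loader.py can no longer await the scrape alongside other work.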