import os
import json
import asyncio
from urllib.parse import urljoin

from tqdm import tqdm
from playwright.async_api import async_playwright
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Imports from helpers
from helpers import (
    list_docx_files,       # List all .docx files in a folder
    get_splits,            # Split a .docx file into chunks
    get_json_splits_only,  # Split a JSON (FAQ) file into chunks
    # scrape_website,      # Process data scraped from the web
)

# Make sure the Chromium browser used by Playwright is installed
os.system("playwright install chromium")

SCRAPED_DATA_PATH = "scraped_data.json"
# ----------- ASYNC SCRAPING FUNCTIONS -----------

async def _fetch_urls(base_url):
    """Extract all links from a JavaScript-rendered webpage."""
    urls = set()
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            await page.goto(base_url)
            await page.wait_for_load_state("networkidle")

            links = await page.locator("a").all()
            for link in links:
                href = await link.get_attribute("href")
                if href and "#" not in href:
                    full_url = urljoin(base_url, href)
                    # Keep only links that stay within the base site
                    if full_url.startswith(base_url):
                        urls.add(full_url)

            await browser.close()
    except Exception as e:
        print(f"⚠️ Could not access {base_url}: {e}")
    return list(urls)
async def _fetch_web_content(urls):
    """Fetch HTML content and convert it to text, with a progress bar."""
    docs = []
    progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
    for page_url in urls:
        try:
            # Replace these with your actual async loader/transformer.
            loader = AsyncHtmlLoader(page_url)
            html2text = Html2TextTransformer()

            html = await loader.aload()
            doc = html2text.transform_documents(html)
            docs.extend(doc)
        except Exception as e:
            print(f"Error loading {page_url}: {e}")
        progress_bar.update(1)
    progress_bar.close()
    return docs
async def scrape_website(base_urls):
    """
    Scrapes a list of base URLs and extracts their content.
    """
    all_urls = []
    for base_url in base_urls:
        urls = await _fetch_urls(base_url)
        all_urls.extend(urls)

    docs = await _fetch_web_content(all_urls)
    return docs
async def get_scraped_data(base_urls):
    """
    Automatically load scraped data from file if available;
    otherwise, scrape and cache it.
    """
    if os.path.exists(SCRAPED_DATA_PATH):
        print("🔄 Loading scraped website contents from file...")
        with open(SCRAPED_DATA_PATH, "r", encoding="utf-8") as f:
            cached = json.load(f)
        # Rebuild Document objects so they can be fed to FAISS.from_documents
        return [
            Document(page_content=item["page_content"], metadata=item.get("metadata", {}))
            for item in cached
        ]

    print("🌍 Scraping websites...")
    website_contents = await scrape_website(base_urls)
    # Documents are not directly JSON-serializable, so cache their fields instead
    serializable = [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in website_contents
    ]
    with open(SCRAPED_DATA_PATH, "w", encoding="utf-8") as f:
        json.dump(serializable, f, ensure_ascii=False, indent=4)
    return website_contents
async def get_vectorstore():
    ### Process all documents and feed them into the vector database
    folder_path = "syllabus_nct_word_format/"
    docx_files = list_docx_files(folder_path)

    all_splits = []  # Accumulates every document split

    print("Feeding relevant websites' contents")
    base_urls = ['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
    # Full list: ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
    website_contents = await get_scraped_data(base_urls=base_urls)
    all_splits += website_contents

    print('Feeding .docx files')
    for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
        output_json_path = f"output_{i}.json"
        splits = get_splits(file_path, output_json_path)
        all_splits += splits

    print('Feeding .json files')
    # Process the FAQ files
    FAQ_path = "syllabus_nct_word_format/FAQ.json"
    FAQ_splits = get_json_splits_only(FAQ_path)
    all_splits += FAQ_splits

    FAQ_path = "syllabus_nct_word_format/FAQ2.json"
    FAQ_splits = get_json_splits_only(FAQ_path)
    all_splits += FAQ_splits

    # Store everything in a vectorstore (Google GenAI embeddings are an alternative):
    # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    print('Get embedding model paraphrase-multilingual-MiniLM-L12-v2')
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    print('Set vectorstore FAISS')
    vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    return vectorstore
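
# --- Illustrative usage sketch (not part of the original pipeline) ---
# A minimal example of how get_vectorstore() could be run standalone and queried.
# The index folder name, the sample query, and k below are arbitrary assumptions
# made for demonstration only.
if __name__ == "__main__":
    async def _demo():
        vectorstore = await get_vectorstore()
        # Optionally persist the index so later runs can reload it with
        # FAISS.load_local(...) instead of re-embedding every document.
        vectorstore.save_local("faiss_index")
        # Run a sample similarity search against the freshly built index.
        results = vectorstore.similarity_search("What courses are in the NCT program?", k=3)
        for doc in results:
            print(doc.page_content[:200])

    asyncio.run(_demo())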