Update helpers.py

helpers.py · CHANGED · +55 -19
@@ -16,7 +16,7 @@ import shutil
 import requests
 from bs4 import BeautifulSoup
 import os
-from langchain_docling import DoclingLoader
+from langchain_docling import DoclingLoader, ExportType
 
 
 # from file_loader import get_vectorstore
@@ -91,24 +91,67 @@ key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
 
 # return asyncio.run(_main)
 
+# class ChunkerWrapper:
+#     def __init__(self, splitter):
+#         self.splitter = splitter
+
+#     def chunk(self, text):
+#         # Use the 'split_text' method of the splitter to divide the text
+#         return self.splitter.split_text(text)
+
+# def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
+#     """Tải nội dung từ danh sách URL với thanh tiến trình"""
+#     docs = []
+#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
+#     chunker = ChunkerWrapper(text_splitter)
+#     for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
+#         try:
+#             # loader = WebBaseLoader(page_url)
+#             loader = DoclingLoader(file_path=page_url, chunker=chunker # This will break your doc into manageable pieces.
+#             )
+#             html = loader.load()
+#             doc = html
+#             docs.extend(doc)
+#         except Exception as e:
+#             print(f"Lỗi khi tải {page_url}: {e}")
+
+#     print(f"Tải thành công {len(docs)} trang.")
+#     return docs
+
+# def load_text_data(file_path):
+#     """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
+#     # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
+#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
+#     chunker = ChunkerWrapper(text_splitter)
+#     return DoclingLoader(file_path=file_path, chunker=chunker # This will break your doc into manageable pieces.
+#     ).load()
+
+
 def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
-    """Tải nội dung từ danh sách URL với thanh tiến trình"""
-    docs = []
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
-    for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
+    """Fetch content from a list of URLs with a progress bar."""
+    docs = []
+    for page_url in tqdm(base_urls, desc="Loading page", unit="url"):
         try:
-            # loader = WebBaseLoader(page_url)
-            loader = DoclingLoader(file_path=page_url, chunker=text_splitter # This will break your doc into manageable pieces.
-            )
-            html = loader.load()
-            doc = html
+            loader = DoclingLoader(
+                file_path=page_url,
+                export_type=ExportType.DOC_CHUNKS  # Enable internal chunking
+            )
+            doc = loader.load()
             docs.extend(doc)
         except Exception as e:
-            print(f"Lỗi khi tải {page_url}: {e}")
+            print(f"Error loading {page_url}: {e}")
 
-    print(f"Tải thành công {len(docs)} trang.")
+    print(f"Successfully loaded {len(docs)} documents.")
     return docs
 
+def load_text_data(file_path):
+    """Load text content from a DOCX file (tables removed)."""
+    loader = DoclingLoader(
+        file_path=file_path,
+        export_type=ExportType.DOC_CHUNKS  # Enable internal chunking
+    )
+    return loader.load()
+
 
 def log_message(messages, filename="chat_log.txt"):
     """Ghi lịch sử tin nhắn vào file log"""
@@ -134,13 +177,6 @@ def remove_tables_from_docx(file_path):
 
     return temp_path  # ✅ Trả về đường dẫn file mới, không làm hỏng file gốc
 
-def load_text_data(file_path):
-    """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
-    # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
-    return DoclingLoader(file_path=file_path, chunker=text_splitter # This will break your doc into manageable pieces.
-    ).load()
-
 
 def extract_tables_from_docx(file_path):
     doc = Document(file_path)
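
For reference, a minimal usage sketch of the two rewritten helpers, not part of the commit. It assumes langchain-docling is installed and helpers.py is importable; the DOCX path is hypothetical, and the printed metadata is whatever DoclingLoader attaches:

    # Usage sketch (illustrative only): drive the new helpers end to end.
    from helpers import get_web_documents, load_text_data

    web_chunks = get_web_documents(base_urls=['https://nct.neu.edu.vn/'])
    docx_chunks = load_text_data("data/handbook.docx")  # hypothetical path

    # With export_type=ExportType.DOC_CHUNKS, each returned item is already a
    # chunk-sized LangChain Document, so no separate splitter pass is needed
    # before embedding or indexing.
    for doc in (web_chunks + docx_chunks)[:3]:
        print(doc.metadata, "->", doc.page_content[:80])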