Update helpers.py

helpers.py  (+13, -11)
```diff
@@ -16,6 +16,8 @@ import shutil
 import requests
 from bs4 import BeautifulSoup
 import os
+from langchain_docling import DoclingLoader
+
 
 # from file_loader import get_vectorstore
 if "GOOGLE_API_KEY" not in os.environ:
```
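The one functional import change: `DoclingLoader` comes from the langchain-docling integration package. A minimal sketch of the loader on its own, assuming `pip install langchain-docling`; the URL is the one used elsewhere in this file:

```python
# Minimal sketch, assuming the langchain-docling package is installed
# (pip install langchain-docling). DoclingLoader takes a file path or URL,
# parses it with Docling, and returns LangChain Document objects directly,
# so no separate HTML-to-text transform is required afterwards.
from langchain_docling import DoclingLoader

docs = DoclingLoader("https://nct.neu.edu.vn/").load()
print(docs[0].page_content[:200])  # extracted text of the first document
```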
```diff
@@ -94,11 +96,11 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
     docs = []
     for page_url in base_urls:  # Check that the urls variable was defined earlier
         try:
-            loader =
-            html2text = Html2TextTransformer()
+            loader = DoclingLoader(page_url)
+            # html2text = Html2TextTransformer()
 
             html = loader.load()
-            doc = html2text.transform_documents(html)
+            doc = html  # html2text.transform_documents(html)
             docs.extend(doc)
         except Exception as e:
             print(f"Error loading {page_url}: {e}")
```
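`get_web_documents` now delegates both fetching and text extraction to Docling; because `DoclingLoader.load()` already yields LangChain Documents, the `Html2TextTransformer` step is commented out rather than deleted. A sketch of how the function reads after this hunk; the trailing `return docs` is an assumption, since the hunk cuts off at the `except` block:

```python
from langchain_docling import DoclingLoader

def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
    """Fetch each URL with DoclingLoader; skip pages that fail to load."""
    docs = []
    for page_url in base_urls:
        try:
            loader = DoclingLoader(page_url)
            html = loader.load()   # already a list of LangChain Documents
            doc = html             # no HTML-to-text transform needed
            docs.extend(doc)
        except Exception as e:
            print(f"Error loading {page_url}: {e}")
    return docs  # assumed: the diff hunk ends before the function's return
```

The per-URL try/except keeps one unreachable page from aborting the whole crawl, which matters for a loader pointed at a live university site.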
```diff
@@ -132,8 +134,8 @@ def remove_tables_from_docx(file_path):
 
 def load_text_data(file_path):
     """Load text content from a DOCX file (tables removed)."""
-    cleaned_file = remove_tables_from_docx(file_path)
-    return
+    cleaned_file = Document(file_path)  # remove_tables_from_docx(file_path)
+    return DoclingLoader(cleaned_file).load()
 
 
 def extract_tables_from_docx(file_path):
```
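`load_text_data` now skips the table-stripping step and hands the document to Docling. One caveat worth flagging: python-docx's `Document(file_path)` returns an in-memory object, while `DoclingLoader`'s `file_path` argument expects a path or URL, so the committed line may not run as written. A hedged sketch of the simpler equivalent, passing the path straight through while `remove_tables_from_docx` stays disabled:

```python
from langchain_docling import DoclingLoader

def load_text_data(file_path):
    """Load text content from a DOCX file (tables removed)."""
    # Sketch under the assumption that DoclingLoader wants a path or URL
    # rather than a python-docx Document object; with the table-stripping
    # step disabled, the original path can be handed to Docling directly.
    return DoclingLoader(file_path).load()
```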
```diff
@@ -210,17 +212,17 @@ def load_table_data(file_path, output_json_path):
     return table_data
 
 def get_splits(file_path, output_json_path):
-    table_data = load_table_data(file_path, output_json_path)
-    text_data = load_text_data(file_path)
+    # table_data = load_table_data(file_path, output_json_path)
+    # text_data = load_text_data(file_path)
 
     # Split the text into chunks
-    json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
+    # json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
 
-    table_splits = json_splitter.create_documents(texts=[table_data])
+    # table_splits = json_splitter.create_documents(texts=[table_data])
     text_splits = text_splitter.split_documents(text_data)
-    all_splits = table_splits + text_splits
-    return
+    # all_splits = table_splits + text_splits DoclingLoader
+    return text_splits
 
 def get_json_splits_only(file_path):
     table_data = load_json_manually(file_path)
```
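As committed, `get_splits` still calls `text_splitter.split_documents(text_data)` while the line that defines `text_data` is commented out, so calling the function would raise `NameError`. A sketch of the presumably intended text-only version, restoring that single assignment; `load_text_data` is the helper defined earlier in this file, and `output_json_path` is kept for call-site compatibility even though it goes unused while table splitting is disabled:

```python
# The import is repeated here so the sketch is self-contained; helpers.py
# already imports RecursiveCharacterTextSplitter elsewhere.
from langchain_text_splitters import RecursiveCharacterTextSplitter

def get_splits(file_path, output_json_path):
    # Text-only splitting: the JSON/table path is disabled in this commit.
    text_data = load_text_data(file_path)  # restored; the committed code comments this out

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
    text_splits = text_splitter.split_documents(text_data)
    return text_splits
```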