Update helpers.py

helpers.py  (+13, -11)
```diff
@@ -16,6 +16,8 @@ import shutil
 import requests
 from bs4 import BeautifulSoup
 import os
+from langchain_docling import DoclingLoader
+
 
 # from file_loader import get_vectorstore
 if "GOOGLE_API_KEY" not in os.environ:
```
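The one functional import change: `DoclingLoader` comes from the langchain-docling integration package. A minimal sketch of the loader on its own, assuming `pip install langchain-docling`; the URL is the one used elsewhere in this file:

```python
# Minimal sketch, assuming the langchain-docling package is installed
# (pip install langchain-docling). DoclingLoader takes a file path or URL,
# parses it with Docling, and returns LangChain Document objects directly,
# so no separate HTML-to-text transform is required afterwards.
from langchain_docling import DoclingLoader

docs = DoclingLoader("https://nct.neu.edu.vn/").load()
print(docs[0].page_content[:200])  # extracted text of the first document
```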
```diff
@@ -94,11 +96,11 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
     docs = []
     for page_url in base_urls:  # Check that the urls variable was defined earlier
         try:
-            loader =
-            html2text = Html2TextTransformer()
+            loader = DoclingLoader(page_url)
+            # html2text = Html2TextTransformer()
 
             html = loader.load()
-            doc = html2text.transform_documents(html)
+            doc = html  # html2text.transform_documents(html)
             docs.extend(doc)
         except Exception as e:
             print(f"Error loading {page_url}: {e}")
```
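`get_web_documents` now delegates both fetching and text extraction to Docling; because `DoclingLoader.load()` already yields LangChain Documents, the `Html2TextTransformer` step is commented out rather than deleted. A sketch of how the function reads after this hunk; the trailing `return docs` is an assumption, since the hunk cuts off at the `except` block:

```python
from langchain_docling import DoclingLoader

def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
    """Fetch each URL with DoclingLoader; skip pages that fail to load."""
    docs = []
    for page_url in base_urls:
        try:
            loader = DoclingLoader(page_url)
            html = loader.load()   # already a list of LangChain Documents
            doc = html             # no HTML-to-text transform needed
            docs.extend(doc)
        except Exception as e:
            print(f"Error loading {page_url}: {e}")
    return docs  # assumed: the diff hunk ends before the function's return
```

The per-URL try/except keeps one unreachable page from aborting the whole crawl, which matters for a loader pointed at a live university site.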
```diff
@@ -132,8 +134,8 @@ def remove_tables_from_docx(file_path):
 
 def load_text_data(file_path):
     """Load text content from a DOCX file (tables removed)."""
-    cleaned_file = remove_tables_from_docx(file_path)
-    return
+    cleaned_file = Document(file_path)  # remove_tables_from_docx(file_path)
+    return DoclingLoader(cleaned_file).load()
 
 
 def extract_tables_from_docx(file_path):
```
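`load_text_data` now skips the table-stripping step and hands the document to Docling. One caveat worth flagging: python-docx's `Document(file_path)` returns an in-memory object, while `DoclingLoader`'s `file_path` argument expects a path or URL, so the committed line may not run as written. A hedged sketch of the simpler equivalent, passing the path straight through while `remove_tables_from_docx` stays disabled:

```python
from langchain_docling import DoclingLoader

def load_text_data(file_path):
    """Load text content from a DOCX file (tables removed)."""
    # Sketch under the assumption that DoclingLoader wants a path or URL
    # rather than a python-docx Document object; with the table-stripping
    # step disabled, the original path can be handed to Docling directly.
    return DoclingLoader(file_path).load()
```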
```diff
@@ -210,17 +212,17 @@ def load_table_data(file_path, output_json_path):
     return table_data
 
 def get_splits(file_path, output_json_path):
-    table_data = load_table_data(file_path, output_json_path)
-    text_data = load_text_data(file_path)
+    # table_data = load_table_data(file_path, output_json_path)
+    # text_data = load_text_data(file_path)
 
     # Split the text into chunks
-    json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
+    # json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
 
-    table_splits = json_splitter.create_documents(texts=[table_data])
+    # table_splits = json_splitter.create_documents(texts=[table_data])
     text_splits = text_splitter.split_documents(text_data)
-    all_splits = table_splits + text_splits
-    return
+    # all_splits = table_splits + text_splits DoclingLoader
+    return text_splits
 
 def get_json_splits_only(file_path):
     table_data = load_json_manually(file_path)
```
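As committed, `get_splits` still calls `text_splitter.split_documents(text_data)` while the line that defines `text_data` is commented out, so calling the function would raise `NameError`. A sketch of the presumably intended text-only version, restoring that single assignment; `load_text_data` is the helper defined earlier in this file, and `output_json_path` is kept for call-site compatibility even though it goes unused while table splitting is disabled:

```python
# The import is repeated here so the sketch is self-contained; helpers.py
# already imports RecursiveCharacterTextSplitter elsewhere.
from langchain_text_splitters import RecursiveCharacterTextSplitter

def get_splits(file_path, output_json_path):
    # Text-only splitting: the JSON/table path is disabled in this commit.
    text_data = load_text_data(file_path)  # restored; the committed code comments this out

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
    text_splits = text_splitter.split_documents(text_data)
    return text_splits
```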