Update helpers.py

helpers.py · CHANGED · +55 -19
@@ -16,7 +16,7 @@ import shutil
 import requests
 from bs4 import BeautifulSoup
 import os
-from langchain_docling import DoclingLoader
+from langchain_docling import DoclingLoader, ExportType
 
 
 # from file_loader import get_vectorstore
@@ -91,24 +91,67 @@ key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
 
 # return asyncio.run(_main)
 
+# class ChunkerWrapper:
+#     def __init__(self, splitter):
+#         self.splitter = splitter
+
+#     def chunk(self, text):
+#         # Use the 'split_text' method of the splitter to divide the text
+#         return self.splitter.split_text(text)
+
+# def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
+#     """Tải nội dung từ danh sách URL với thanh tiến trình"""
+#     docs = []
+#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
+#     chunker = ChunkerWrapper(text_splitter)
+#     for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
+#         try:
+#             # loader = WebBaseLoader(page_url)
+#             loader = DoclingLoader(file_path=page_url, chunker=chunker # This will break your doc into manageable pieces.
+#             )
+#             html = loader.load()
+#             doc = html
+#             docs.extend(doc)
+#         except Exception as e:
+#             print(f"Lỗi khi tải {page_url}: {e}")
+
+#     print(f"Tải thành công {len(docs)} trang.")
+#     return docs
+
+# def load_text_data(file_path):
+#     """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
+#     # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
+#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
+#     chunker = ChunkerWrapper(text_splitter)
+#     return DoclingLoader(file_path=file_path, chunker=chunker # This will break your doc into manageable pieces.
+#     ).load()
+
+
 def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
-    """Tải nội dung từ danh sách URL với thanh tiến trình"""
-    docs = []
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
-    for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
+    """Fetch content from a list of URLs with a progress bar."""
+    docs = []
+    for page_url in tqdm(base_urls, desc="Loading page", unit="url"):
         try:
-            # loader = WebBaseLoader(page_url)
-            loader = DoclingLoader(file_path=page_url, chunker=text_splitter # This will break your doc into manageable pieces.
-            )
-            html = loader.load()
-            doc = html
+            loader = DoclingLoader(
+                file_path=page_url,
+                export_type=ExportType.DOC_CHUNKS  # Enable internal chunking
+            )
+            doc = loader.load()
             docs.extend(doc)
         except Exception as e:
-            print(f"Lỗi khi tải {page_url}: {e}")
+            print(f"Error loading {page_url}: {e}")
 
-    print(f"Tải thành công {len(docs)} trang.")
+    print(f"Successfully loaded {len(docs)} documents.")
     return docs
 
+def load_text_data(file_path):
+    """Load text content from a DOCX file (tables removed)."""
+    loader = DoclingLoader(
+        file_path=file_path,
+        export_type=ExportType.DOC_CHUNKS  # Enable internal chunking
+    )
+    return loader.load()
+
 
 def log_message(messages, filename="chat_log.txt"):
     """Ghi lịch sử tin nhắn vào file log"""
@@ -134,13 +177,6 @@ def remove_tables_from_docx(file_path):
 
     return temp_path  # ✅ Trả về đường dẫn file mới, không làm hỏng file gốc
 
-def load_text_data(file_path):
-    """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
-    # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
-    return DoclingLoader(file_path=file_path, chunker=text_splitter # This will break your doc into manageable pieces.
-    ).load()
-
 
 def extract_tables_from_docx(file_path):
     doc = Document(file_path)
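
For reference, a minimal usage sketch of the two rewritten helpers, not part of the commit. It assumes langchain-docling is installed and helpers.py is importable; the DOCX path is hypothetical, and the printed metadata is whatever DoclingLoader attaches:

    # Usage sketch (illustrative only): drive the new helpers end to end.
    from helpers import get_web_documents, load_text_data

    web_chunks = get_web_documents(base_urls=['https://nct.neu.edu.vn/'])
    docx_chunks = load_text_data("data/handbook.docx")  # hypothetical path

    # With export_type=ExportType.DOC_CHUNKS, each returned item is already a
    # chunk-sized LangChain Document, so no separate splitter pass is needed
    # before embedding or indexing.
    for doc in (web_chunks + docx_chunks)[:3]:
        print(doc.metadata, "->", doc.page_content[:80])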