Spaces:
Sleeping
Sleeping
Update helpers.py
Browse files - helpers.py +3 -1
helpers.py
CHANGED
|
@@ -147,9 +147,11 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
|
|
| 147 |
|
| 148 |
def load_text_data(file_path):
|
| 149 |
"""Load text content from a DOCX file (tables removed)."""
|
|
|
|
| 150 |
loader = DoclingLoader(
|
| 151 |
file_path=file_path,
|
| 152 |
-
export_type=ExportType.
|
|
|
|
| 153 |
)
|
| 154 |
return loader.load()
|
| 155 |
|
|
|
|
| 147 |
|
| 148 |
def load_text_data(file_path):
    """Load a document with Docling as Markdown and split it into chunks.

    The file is converted by ``DoclingLoader`` in ``ExportType.MARKDOWN``
    mode, and the resulting documents are then split with a
    ``RecursiveCharacterTextSplitter`` (chunk_size=512, chunk_overlap=100).

    Args:
        file_path: Path of the file handed to ``DoclingLoader``.

    Returns:
        The list of chunked documents produced by the text splitter.

    NOTE(review): the previous docstring said "DOCX file (tables removed)";
    nothing in this function removes tables or restricts the input to DOCX --
    confirm whether that is handled elsewhere.
    """
    # The splitter is applied after loading rather than being passed as
    # ``chunker=``: per the langchain-docling documentation, ``chunker`` is
    # only consulted for ``ExportType.DOC_CHUNKS`` and expects a Docling
    # chunker, so a LangChain splitter passed there has no effect in
    # MARKDOWN mode.
    loader = DoclingLoader(
        file_path=file_path,
        export_type=ExportType.MARKDOWN,
    )
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
    return text_splitter.split_documents(loader.load())
|
| 157 |
|