Fix chunk method "Table" losing content when the Excel file has multiple sheets (#4123)
Browse files
### What problem does this PR solve?
discussed in https://github.com/infiniflow/ragflow/pull/4102
- In excel_parser.py, `total` is meant to be the total number of rows in the Excel file,
but it was returned during the first iteration of the sheet loop, which led to a wrong `to_page`.
- In table.py, when the Excel file has multiple sheets, it is
divided into multiple parts of 3000 rows each; `data` may be
empty for a part because its rows were already recorded in the previous iteration.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- deepdoc/parser/excel_parser.py +1 -1
- rag/app/table.py +2 -0
deepdoc/parser/excel_parser.py
CHANGED
@@ -90,7 +90,7 @@ class RAGFlowExcelParser:
|
|
90 |
for sheetname in wb.sheetnames:
|
91 |
ws = wb[sheetname]
|
92 |
total += len(list(ws.rows))
|
93 |
-
|
94 |
|
95 |
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
96 |
encoding = find_codec(binary)
|
|
|
90 |
for sheetname in wb.sheetnames:
|
91 |
ws = wb[sheetname]
|
92 |
total += len(list(ws.rows))
|
93 |
+
return total
|
94 |
|
95 |
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
96 |
encoding = find_codec(binary)
|
rag/app/table.py
CHANGED
@@ -66,6 +66,8 @@ class Excel(ExcelParser):
|
|
66 |
continue
|
67 |
data.append(row)
|
68 |
done += 1
|
|
|
|
|
69 |
res.append(pd.DataFrame(np.array(data), columns=headers))
|
70 |
|
71 |
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
|
|
|
66 |
continue
|
67 |
data.append(row)
|
68 |
done += 1
|
69 |
+
if np.array(data).size == 0:
|
70 |
+
continue
|
71 |
res.append(pd.DataFrame(np.array(data), columns=headers))
|
72 |
|
73 |
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
|