Commit 6e567cd · Parent: 1427166

Update progress info and start welcome info (#3768)

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] Refactoring

---------

Signed-off-by: jinhai <[email protected]>

Files changed:
- api/ragflow_server.py +2 -0
- rag/app/book.py +11 -8
- rag/app/laws.py +6 -5
- rag/app/manual.py +10 -5
- rag/app/naive.py +13 -8
- rag/app/one.py +11 -6
- rag/app/paper.py +11 -6
- rag/app/presentation.py +4 -3
- rag/app/qa.py +11 -7
- rag/settings.py +7 -0
- rag/svr/task_executor.py +21 -11
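
The parser hunks below all apply one pattern: each stage is timed with timeit.default_timer, and the stage's elapsed seconds are appended to the progress message sent through the parser's callback. A minimal sketch of that pattern follows; run_ocr, run_layout_analysis, and the print-based callback are hypothetical stand-ins for illustration, not code from this repository — only the timer-plus-callback shape mirrors the diffs.

# Sketch only: `run_ocr`, `run_layout_analysis`, and this `callback` are
# stand-ins; the timer + callback shape is what the commit applies per stage.
from timeit import default_timer as timer

def run_ocr():
    pass  # placeholder for a stage like self.__images__(...)

def run_layout_analysis():
    pass  # placeholder for a stage like self._layouts_rec(zoomin)

def callback(prog=None, msg=""):
    print(prog, msg)  # RAGFlow forwards this to task progress instead

start = timer()
callback(msg="OCR started")
run_ocr()
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))

start = timer()  # reset before each stage so every message reports its own cost
run_layout_analysis()
callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
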
api/ragflow_server.py CHANGED

@@ -47,6 +47,7 @@ from api.db.db_models import init_database_tables as init_web_db
 from api.db.init_data import init_web_data
 from api.versions import get_ragflow_version
 from api.utils import show_configs
+from rag.settings import print_rag_settings
 
 
 def update_progress():
@@ -75,6 +76,7 @@ if __name__ == '__main__':
     )
     show_configs()
    settings.init_settings()
+    print_rag_settings()
 
     # init db
     init_web_db()
rag/app/book.py CHANGED

@@ -26,30 +26,33 @@ from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
             from_page,
             to_page,
             callback)
-        callback(msg="OCR finished")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
 
-        from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.67, "Layout analysis finished")
+        callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug("layouts: {}".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.68, "Table analysis finished")
+        callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._naive_vertical_merge()
         self._filter_forpages()
         self._merge_with_same_bullet()
-        callback(0.75, "Text merging finished.")
-
-        callback(0.8, "Text extraction finished")
+        callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
 
         return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                 for b in self.boxes], tbls
rag/app/laws.py CHANGED

@@ -108,7 +108,9 @@ class Pdf(PdfParser):
 
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -116,17 +118,16 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
 
-        from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.67, "Layout analysis finished")
+        callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug("layouts:".format(
         ))
         self._naive_vertical_merge()
 
-        callback(0.8, "Text extraction finished")
+        callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
 
         return [(b["text"], self._line_tag(b, zoomin))
                 for b in self.boxes], None
rag/app/manual.py CHANGED

@@ -36,7 +36,7 @@ class Pdf(PdfParser):
                  to_page=100000, zoomin=3, callback=None):
         from timeit import default_timer as timer
         start = timer()
-        callback(msg="OCR is running...")
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -44,22 +44,27 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished.")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
         # for bb in self.boxes:
         #     for b in bb:
         #         print(b)
         logging.debug("OCR: {}".format(timer() - start))
 
+        start = timer()
         self._layouts_rec(zoomin)
-        callback(0.65, "Layout analysis finished")
+        callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug("layouts: {}".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.67, "Table analysis finished")
+        callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._concat_downward()
         self._filter_forpages()
-        callback(0.68, "Text merging finished")
+        callback(0.68, "Text merged ({:.2f}s)".format(timer() - start))
 
         # clean mess
         for b in self.boxes:
rag/app/naive.py CHANGED

@@ -124,7 +124,8 @@ class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
         start = timer()
-        callback(msg="OCR is running...")
+        first_start = start
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -132,22 +133,26 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished")
-        logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
 
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.63, "Layout analysis finished")
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.65, "Table analysis finished")
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
-        callback(0.67, "Text merging finished")
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
         tbls = self._extract_table_figure(True, zoomin, True, True)
         # self._naive_vertical_merge()
         self._concat_downward()
         # self._filter_forpages()
 
-        logging.info("layouts cost: {}s".format(timer() - start))
+        logging.info("layouts cost: {}s".format(timer() - first_start))
         return [(b["text"], self._line_tag(b, zoomin))
                 for b in self.boxes], tbls
 
@@ -170,7 +175,7 @@ class Markdown(MarkdownParser):
             else:
                 if sections and sections[-1][0].strip().find("#") == 0:
                     sec_, _ = sections.pop(-1)
-                    sections.append((sec_+"\n"+sec, ""))
+                    sections.append((sec_ + "\n" + sec, ""))
                 else:
                     sections.append((sec, ""))
 
rag/app/one.py CHANGED

@@ -24,7 +24,9 @@ from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -32,17 +34,20 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
 
-        from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin, drop=False)
-        callback(0.63, "Layout analysis finished")
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug("layouts cost: {}s".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.65, "Table analysis finished")
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
-        callback(0.67, "Text merging finished")
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._concat_downward()
 
rag/app/paper.py CHANGED

@@ -27,7 +27,9 @@ class Pdf(PdfParser):
 
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -35,21 +37,24 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished.")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
 
-        from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.63, "Layout analysis finished")
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug(f"layouts cost: {timer() - start}s")
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.68, "Table analysis finished")
+        callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
         tbls = self._extract_table_figure(True, zoomin, True, True)
         column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
         self._concat_downward()
         self._filter_forpages()
-        callback(0.75, "Text merging finished.")
+        callback(0.75, "Text merged ({:.2f}s)".format(timer() - start))
 
         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
rag/app/presentation.py CHANGED

@@ -59,11 +59,12 @@ class Pdf(PdfParser):
 
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(filename if not binary else binary,
                         zoomin, from_page, to_page, callback)
-        callback(msg="Page {}~{}: OCR finished".format(
-            from_page, min(to_page, self.total_page)))
+        callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
             len(self.boxes), len(self.page_images))
         res = []
rag/app/qa.py CHANGED

@@ -73,7 +73,7 @@ class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
         start = timer()
-        callback(msg="OCR is running...")
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -81,15 +81,19 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished")
-        logging.debug("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.debug("OCR({}~{}): {:.2f}".format(from_page, to_page, timer() - start))
         start = timer()
         self._layouts_rec(zoomin, drop=False)
-        callback(0.63, "Layout analysis finished")
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.65, "Table analysis finished")
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
-        callback(0.67, "Text merging finished")
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
         tbls = self._extract_table_figure(True, zoomin, True, True)
         #self._naive_vertical_merge()
         # self._concat_downward()
@@ -226,7 +230,7 @@ class Docx(DocxParser):
         sum_question = '\n'.join(question_stack)
         if sum_question:
             qai_list.append((sum_question, last_answer, last_image))
-
+
         tbls = []
         for tb in self.doc.tables:
             html= "<table>"
rag/settings.py CHANGED

@@ -14,6 +14,7 @@
 # limitations under the License.
 #
 import os
+import logging
 from api.utils import get_base_config, decrypt_database_config
 from api.utils.file_utils import get_project_base_directory
 
@@ -37,3 +38,9 @@ SVR_QUEUE_RETENTION = 60*60
 SVR_QUEUE_MAX_LEN = 1024
 SVR_CONSUMER_NAME = "rag_flow_svr_consumer"
 SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group"
+
+def print_rag_settings():
+    logging.info(f"MAX_CONTENT_LENGTH: {DOC_MAXIMUM_SIZE}")
+    logging.info(f"SERVER_QUEUE_MAX_LEN: {SVR_QUEUE_MAX_LEN}")
+    logging.info(f"SERVER_QUEUE_RETENTION: {SVR_QUEUE_RETENTION}")
+    logging.info(f"MAX_FILE_COUNT_PER_USER: {int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))}")
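
The new helper is invoked right after settings.init_settings() in both entry points (the api/ragflow_server.py hunk above and task_executor.py's main() below), so the effective limits appear once in each startup log. A condensed sketch of that call order, assuming the same imports those files use:

# Condensed from the two startup hunks in this commit (no new API assumed):
from api import settings
from rag.settings import print_rag_settings

settings.init_settings()  # load service settings first
print_rag_settings()      # then log DOC_MAXIMUM_SIZE, queue limits, per-user file cap
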
rag/svr/task_executor.py CHANGED

@@ -56,12 +56,13 @@ from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService
 from api.db.services.file2document_service import File2DocumentService
 from api import settings
+from api.versions import get_ragflow_version
 from api.db.db_models import close_connection
 from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, \
     knowledge_graph, email
 from rag.nlp import search, rag_tokenizer
 from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
-from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME
+from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME, print_rag_settings
 from rag.utils import rmSpace, num_tokens_from_string
 from rag.utils.redis_conn import REDIS_CONN, Payload
 from rag.utils.storage_factory import STORAGE_IMPL
@@ -395,8 +396,7 @@ def do_handle_task(r):
     # TODO: exception handler
     ## set_progress(r["did"], -1, "ERROR: ")
     callback(
-        msg="Finished slicing files ({} chunks in {:.2f}s).".format(len(cks),
-                                                                    timer() - st)
+        msg="Generate {} chunks ({:.2f}s). Embedding chunks.".format(len(cks), timer() - st)
     )
     st = timer()
     try:
@@ -407,7 +407,7 @@ def do_handle_task(r):
         tk_count = 0
         raise
     logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
-    callback(msg="Finished embedding ({:.2f})! Start to build index!".format(timer() - st))
+    callback(msg="Finished embedding ({:.2f}s)!".format(timer() - st))
     # logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
     init_kb(r, vector_size)
     chunk_count = len(set([c["id"] for c in cks]))
@@ -420,7 +420,8 @@ def do_handle_task(r):
             callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
     logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
     if es_r:
-        callback(-1, "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!")
+        callback(-1,
+                 "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!")
         settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
         logging.error('Insert chunk error: ' + str(es_r))
         raise Exception('Insert chunk error: ' + str(es_r))
@@ -429,13 +430,12 @@ def do_handle_task(r):
         settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
         return
 
-    callback(msg="Indexing elapsed in {:.2f}s.".format(timer() - st))
-    callback(1., "Done!")
+    callback(1., msg="Index cost {:.2f}s.".format(timer() - st))
     DocumentService.increment_chunk_num(
         r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
     logging.info(
         "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
-            r["id"], tk_count, len(cks), timer() - st))
+        r["id"], tk_count, len(cks), timer() - st))
 
 
 def handle_task():
@@ -502,7 +502,7 @@ def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapshot, snapshot_id: int, dump_full: bool):
     for stat in stats2[:10]:
         msg += f"{stat}\n"
     stats1_vs_2 = snapshot2.compare_to(snapshot1, 'lineno')
-    msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id-1} to snapshot {snapshot_id}:\n"
+    msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id - 1} to snapshot {snapshot_id}:\n"
     for stat in stats1_vs_2[:10]:
         msg += f"{stat}\n"
     msg += f"{CONSUMER_NAME} detailed traceback for the top memory consumers:\n"
@@ -512,7 +512,16 @@ def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapshot, snapshot_id: int, dump_full: bool):
 
 
 def main():
+    logging.info(r"""
+  ______           __      ______                     __
+ /_  __/___ ______/ /__   / ____/  _____  _______  __/ /_____  _____
+  / / / __ `/ ___/ //_/  / __/ | |/_/ _ \/ ___/ / / / __/ __ \/ ___/
+ / / / /_/ (__  ) ,<    / /____>  </  __/ /__/ /_/ / /_/ /_/ / /
+/_/  \__,_/____/_/|_|  /_____/_/|_|\___/\___/\__,_/\__/\____/_/
+""")
+    logging.info(f'TaskExecutor: RAGFlow version: {get_ragflow_version()}')
     settings.init_settings()
+    print_rag_settings()
     background_thread = threading.Thread(target=report_status)
     background_thread.daemon = True
     background_thread.start()
@@ -527,11 +536,12 @@ def main():
     while True:
         handle_task()
         num_tasks = DONE_TASKS + FAILED_TASKS
-        if TRACE_MALLOC_DELTA> 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0:
+        if TRACE_MALLOC_DELTA > 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0:
             snapshot2 = tracemalloc.take_snapshot()
-            analyze_heap(snapshot1, snapshot2, int(num_tasks/TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0)
+            analyze_heap(snapshot1, snapshot2, int(num_tasks / TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0)
             snapshot1 = snapshot2
             snapshot2 = None
 
+
 if __name__ == "__main__":
     main()
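
For context on the heap-profiling loop the last hunk touches: every TRACE_MALLOC_DELTA completed tasks, main() takes a fresh tracemalloc snapshot and analyze_heap diffs it against the previous one. A self-contained sketch of the same standard-library pattern; allocate_some is a made-up workload, not repository code:

# Sketch of the snapshot-compare pattern behind analyze_heap (stdlib only).
import tracemalloc

def allocate_some():
    return [bytes(1024) for _ in range(1000)]  # ~1 MB of small allocations

tracemalloc.start()
snapshot1 = tracemalloc.take_snapshot()
data = allocate_some()
snapshot2 = tracemalloc.take_snapshot()

# Top allocation growth by source line, like the msg analyze_heap builds:
for stat in snapshot2.compare_to(snapshot1, 'lineno')[:10]:
    print(stat)
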