KevinHuSh committed
Commit b83edb4 · 1 Parent(s): 830bf29

change callback strategy, add timezone to docker (#96)

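The "change callback strategy" half of this commit converges on one progress-callback shape: keyword arguments `prog` and `msg`, each optional, with the callback now threaded down into `PdfParser.__images__` so OCR reports per-page progress. A minimal sketch of that convention, inferred from the `dummy(prog=None, msg="")` stub and the call sites below (the demo function itself is hypothetical, not part of the commit):

def progress_callback(prog=None, msg=""):
    # either argument may be omitted, as the call sites in this diff show
    if prog is not None:
        print("progress: %.0f%%" % (prog * 100))
    if msg:
        print("message:", msg)

progress_callback(msg="OCR is running...")           # message only
progress_callback(0.67, "Layout analysis finished")  # progress + message
progress_callback(prog=0.3)                          # progress only, as __images__ does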
.gitignore CHANGED
@@ -20,5 +20,4 @@ Cargo.lock
 *.trie
 
 .idea/
-.env
 .vscode/

api/apps/document_app.py CHANGED
@@ -141,7 +141,7 @@ def list():
     try:
         docs, tol = DocumentService.get_by_kb_id(
             kb_id, page_number, items_per_page, orderby, desc, keywords)
-        return get_json_result(data={"total":tol, "docs": docs})
+        return get_json_result(data={"total": tol, "docs": docs})
     except Exception as e:
         return server_error_response(e)
 
@@ -217,7 +217,7 @@ def rm():
             return get_data_error_result(retmsg="Tenant not found!")
         ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
 
-        DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, 0)
+        DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
         if not DocumentService.delete_by_id(req["doc_id"]):
             return get_data_error_result(
                 retmsg="Database error (Document removal)!")
@@ -241,7 +241,7 @@ def run():
                 info["chunk_num"] = 0
                 info["token_num"] = 0
             DocumentService.update_by_id(id, info)
-            #if str(req["run"]) == TaskStatus.CANCEL.value:
+            # if str(req["run"]) == TaskStatus.CANCEL.value:
             tenant_id = DocumentService.get_tenant_id(id)
             if not tenant_id:
                 return get_data_error_result(retmsg="Tenant not found!")
@@ -281,7 +281,7 @@ def rename():
 
 
 @manager.route('/get/<doc_id>', methods=['GET'])
-#@login_required
+# @login_required
 def get(doc_id):
     try:
         e, doc = DocumentService.get_by_id(doc_id)
@@ -292,8 +292,9 @@ def get(doc_id):
         ext = re.search(r"\.([^.]+)$", doc.name)
         if ext:
             if doc.type == FileType.VISUAL.value:
-                response.headers.set('Content-Type', 'image/%s'%ext.group(1))
-            else: response.headers.set('Content-Type', 'application/%s'%ext.group(1))
+                response.headers.set('Content-Type', 'image/%s' % ext.group(1))
+            else:
+                response.headers.set('Content-Type', 'application/%s' % ext.group(1))
         return response
     except Exception as e:
         return server_error_response(e)
@@ -314,11 +315,14 @@ def change_parser():
         if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
             return get_data_error_result(retmsg="Not supported yet!")
 
-        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress":0, "progress_msg": "", "run": "0"})
+        e = DocumentService.update_by_id(doc.id,
+                                         {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0",
+                                          "token_num": 0, "chunk_num": 0, "process_duation": 0})
         if not e:
             return get_data_error_result(retmsg="Document not found!")
-        if doc.token_num>0:
-            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, doc.process_duation*-1)
+        if doc.token_num > 0:
+            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
+                                                    doc.process_duation * -1)
         if not e:
             return get_data_error_result(retmsg="Document not found!")
         tenant_id = DocumentService.get_tenant_id(req["doc_id"])
@@ -332,7 +336,7 @@ def change_parser():
 
 
 @manager.route('/image/<image_id>', methods=['GET'])
-#@login_required
+# @login_required
 def get_image(image_id):
     try:
         bkt, nm = image_id.split("-")
@@ -341,4 +345,3 @@ def get_image(image_id):
         return response
     except Exception as e:
         return server_error_response(e)
-
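For reference, a self-contained sketch of the Content-Type selection the `get` handler performs; the helper name and assertions are illustrative, not part of the commit:

import re

def content_type(name, is_visual):
    # visual documents are served as image/<ext>, everything else as application/<ext>
    ext = re.search(r"\.([^.]+)$", name)
    if not ext:
        return None
    return ('image/%s' if is_visual else 'application/%s') % ext.group(1)

assert content_type("scan.png", True) == "image/png"
assert content_type("report.pdf", False) == "application/pdf"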
 
deepdoc/parser/pdf_parser.py CHANGED
@@ -348,6 +348,9 @@ class HuParser:
             if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
                 bxs.pop(i)
                 continue
+            if not b["text"].strip():
+                bxs.pop(i)
+                continue
             concatting_feats = [
                 b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                 len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
@@ -856,7 +859,7 @@ class HuParser:
         pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf")
         return len(pdf)
 
-    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
+    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
         self.lefted_chars = []
         self.mean_height = []
         self.mean_width = []
@@ -917,6 +920,7 @@
             # self.page_cum_height.append(
             #     np.max([c["bottom"] for c in chars]))
             self.__ocr(i + 1, img, chars, zoomin)
+            if callback: callback(prog=(i+1)*0.6/len(self.page_images), msg="")
 
         if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
             bxes = [b for bxs in self.boxes for b in bxs]
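With `callback` now reaching `__images__`, OCR progress is reported proportionally per page, filling the 0-0.6 range before the layout stages take over at 0.63 and up. A toy illustration of that mapping (the page count here is made up):

pages = 5
for i in range(pages):
    prog = (i + 1) * 0.6 / pages   # 0.12, 0.24, ..., 0.60, as in __images__
    print("page %d -> prog %.2f" % (i + 1, prog))
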
docker/.env CHANGED
@@ -16,11 +16,13 @@ MEM_LIMIT=4073741824
 MYSQL_PASSWORD=infini_rag_flow
 MYSQL_PORT=5455
 
-MINIO_USER=rag_flow
+MINIO_USER=infiniflow
 MINIO_PASSWORD=infini_rag_flow
 
 SVR_HTTP_PORT=9380
 
+TIMEZONE='Asia/Shanghai'
+
 ######## OS setup for ES ###########
 # sysctl vm.max_map_count
 # sudo sysctl -w vm.max_map_count=262144

docker/docker-compose.yml CHANGED
@@ -14,6 +14,7 @@ services:
       - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
       - bootstrap.memory_lock=false
       - xpack.security.enabled=false
+      - TZ=${TIMEZONE}
     mem_limit: ${MEM_LIMIT}
     ulimits:
       memlock:
@@ -41,6 +42,7 @@ services:
     environment:
       - SERVERNAME=kibana
       - ELASTICSEARCH_HOSTS=http://es01:9200
+      - TZ=${TIMEZONE}
     mem_limit: ${MEM_LIMIT}
     networks:
       - ragflow
@@ -50,7 +52,7 @@ services:
     container_name: ragflow-mysql
     environment:
       - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
-      - TZ="Asia/Shanghai"
+      - TZ=${TIMEZONE}
     command:
       --max_connections=1000
       --character-set-server=utf8mb4
@@ -83,6 +85,7 @@ services:
     environment:
       - MINIO_ROOT_USER=${MINIO_USER}
       - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
+      - TZ=${TIMEZONE}
     volumes:
       - minio_data:/data
     networks:
@@ -108,6 +111,8 @@ services:
       - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf
       - ./nginx/proxy.conf:/etc/nginx/proxy.conf
       - ./nginx/nginx.conf:/etc/nginx/nginx.conf
+    environment:
+      - TZ=${TIMEZONE}
     networks:
       - ragflow
     restart: always
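With these edits every service (es01, kibana, mysql, minio, and the server container that mounts the nginx configs) takes its timezone from the single `TIMEZONE` entry in `docker/.env`, so switching the whole stack to another zone should only require changing that one value to any IANA zone name (e.g. `UTC`) rather than editing each service's `TZ` variable.
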
rag/app/book.py CHANGED
@@ -26,26 +26,27 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page)
-        callback(0.1, "OCR finished")
+            to_page,
+            callback)
+        callback("OCR finished")
 
         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.47, "Layout analysis finished")
+        callback(0.67, "Layout analysis finished")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
-        self._concat_downward(concat_between_pages=False)
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._naive_vertical_merge()
         self._filter_forpages()
         self._merge_with_same_bullet()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, True, True)
 
         callback(0.8, "Text extraction finished")
 
-        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls, tbl_poss
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -92,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
         if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
         else:
-            sections = [s.split("@") for s in sections]
+            sections = [s.split("@") for s,_ in sections]
             sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
             cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
@@ -116,6 +117,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 
 if __name__ == "__main__":
     import sys
-    def dummy(a, b):
+    def dummy(prog=None, msg=""):
         pass
     chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
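Two things to notice in this file, both mirrored in naive.py below: `_extract_table_figure` now runs before the vertical merge, and `_naive_vertical_merge` replaces `_concat_downward`, presumably so tables and figures are lifted out before text boxes are merged; and the early progress milestones move into the 0.67-0.75 band because OCR now owns 0-0.6 via the forwarded callback.
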
rag/app/laws.py CHANGED
@@ -54,13 +54,15 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page)
-        callback(0.1, "OCR finished")
+            to_page,
+            callback
+        )
+        callback("OCR finished")
 
         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.77, "Layout analysis finished")
+        callback(0.67, "Layout analysis finished")
         cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
         self._naive_vertical_merge()
 
rag/app/manual.py CHANGED
@@ -19,20 +19,22 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page)
-        callback(0.2, "OCR finished.")
+            to_page,
+            callback
+        )
+        callback("OCR finished.")
 
         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.5, "Layout analysis finished.")
+        callback(0.65, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback(0.7, "Table analysis finished.")
+        callback(0.67, "Table analysis finished.")
         self._text_merge()
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
-        callback(0.77, "Text merging finished")
+        callback(0.68, "Text merging finished")
         tbls = self._extract_table_figure(True, zoomin, True, True)
 
         # clean mess

rag/app/naive.py CHANGED
@@ -26,24 +26,24 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page)
-        callback(0.1, "OCR finished")
+            to_page,
+            callback
+        )
+        callback("OCR finished")
 
         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.5, "Layout analysis finished.")
+        callback(0.63, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback(0.7, "Table analysis finished.")
+        callback(0.65, "Table analysis finished.")
         self._text_merge()
-        self._concat_downward(concat_between_pages=False)
-        self._filter_forpages()
-        callback(0.77, "Text merging finished")
+        callback(0.67, "Text merging finished")
         tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._naive_vertical_merge()
 
         cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
-        #self._naive_vertical_merge()
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls

rag/app/paper.py CHANGED
@@ -33,13 +33,15 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page)
-        callback(0.2, "OCR finished.")
+            to_page,
+            callback
+        )
+        callback("OCR finished.")
 
         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.47, "Layout analysis finished")
+        callback(0.63, "Layout analysis finished")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")

rag/app/presentation.py CHANGED
@@ -49,7 +49,7 @@ class Pdf(PdfParser):
 
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
         callback(msg="OCR is running...")
-        self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
+        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
         callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
         res = []

rag/llm/embedding_model.py CHANGED
@@ -56,6 +56,7 @@ class HuEmbedding(Base):
 
 
     def encode(self, texts: list, batch_size=32):
+        texts = [t[:2000] for t in texts]
         token_count = 0
         for t in texts: token_count += num_tokens_from_string(t)
         res = []
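The added first line caps every input at 2,000 characters before token counting and batched encoding. A standalone sketch of the pattern, with a generic `embed_fn` standing in for the real model call (the names here are placeholders, not the actual API):

import numpy as np

def encode_batched(embed_fn, texts, batch_size=32, max_chars=2000):
    # clip overly long inputs, then embed in fixed-size batches
    texts = [t[:max_chars] for t in texts]
    out = []
    for i in range(0, len(texts), batch_size):
        out.append(embed_fn(texts[i:i + batch_size]))
    return np.concatenate(out) if out else np.zeros((0, 0))
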
rag/nlp/__init__.py CHANGED
@@ -114,6 +114,7 @@ def add_positions(d, poss):
         d["page_num_int"].append(pn+1)
         d["top_int"].append(top)
         d["position_int"].append((pn+1, left, right, top, bottom))
+    d["top_int"] = d["top_int"][:1]
 
 
 def remove_contents_table(sections, eng=False):
@@ -172,7 +173,7 @@ def hierarchical_merge(bull, sections, depth):
 
     def not_title(txt):
         if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
-        if len(txt) >= 128: return True
+        if len(txt.split(" "))>12 or (txt.find(" ")<0 and len(txt)) >= 32: return True
         return re.search(r"[,;,。;!!]", txt)
 
     for i, (txt, layout) in enumerate(sections):
@@ -181,12 +182,12 @@ def hierarchical_merge(bull, sections, depth):
             levels[j].append(i)
             break
         else:
-            if re.search(r"(title|head)", layout):
+            if re.search(r"(title|head)", layout) and not not_title(txt):
                 levels[bullets_size].append(i)
             else:
                 levels[bullets_size + 1].append(i)
     sections = [t for t, _ in sections]
-    for s in sections: print("--", s)
+    #for s in sections: print("--", s)
 
     def binary_search(arr, target):
         if not arr: return -1
@@ -220,11 +221,29 @@ def hierarchical_merge(bull, sections, depth):
             if jj > cks[-1][-1]: cks[-1].pop(-1)
             cks[-1].append(levels[ii][jj])
         for ii in cks[-1]: readed[ii] = True
+
+    if not cks:return cks
+
     for i in range(len(cks)):
         cks[i] = [sections[j] for j in cks[i][::-1]]
         print("--------------\n", "\n* ".join(cks[i]))
 
-    return cks
+    res = [[]]
+    num = [0]
+    for ck in cks:
+        if len(ck) == 1:
+            n = num_tokens_from_string(re.sub(r"@@[0-9]+.*", "", ck[0]))
+            if n + num[-1] < 218:
+                res[-1].append(ck[0])
+                num[-1] += n
+                continue
+            res.append(ck)
+            num.append(n)
+            continue
+        res.append(ck)
+        num.append(218)
+
+    return res
 
 
 def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
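The new tail of `hierarchical_merge` coalesces single-section chunks until the running total nears 218 tokens (the threshold hard-coded above). An extracted, standalone sketch of that packing loop; `count_tokens` stands in for `num_tokens_from_string` applied after the `@@...` position-tag strip:

def pack_small_chunks(cks, count_tokens, budget=218):
    # mirrors the diff, including the possibly-empty leading list in `res`
    res, num = [[]], [0]
    for ck in cks:
        if len(ck) == 1:
            n = count_tokens(ck[0])
            if n + num[-1] < budget:       # still fits: append to the open chunk
                res[-1].append(ck[0])
                num[-1] += n
                continue
            res.append(ck)                 # too large to merge: start a new chunk
            num.append(n)
            continue
        res.append(ck)                     # multi-section chunks pass through as-is
        num.append(budget)
    return res

# pack_small_chunks([["a"], ["b"], ["c", "d"]], lambda s: 100)
# -> [["a", "b"], ["c", "d"]]
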
rag/svr/task_broker.py CHANGED
@@ -46,7 +46,7 @@ def collect(tm):
 def set_dispatching(docid):
     try:
         DocumentService.update_by_id(
-            docid, {"progress": random.randint(0, 3) / 100.,
+            docid, {"progress": random.random()*1 / 100.,
                     "progress_msg": "Task dispatched...",
                     "process_begin_at": get_format_time()
                     })

rag/svr/task_executor.py CHANGED
@@ -72,7 +72,8 @@ def set_progress(task_id, from_page=0, to_page=-1,
         prog = -1
 
     if to_page > 0:
-        msg = f"Page({from_page}~{to_page}): " + msg
+        if msg:
+            msg = f"Page({from_page}~{to_page}): " + msg
     d = {"progress_msg": msg}
     if prog is not None:
         d["progress"] = prog
@@ -168,7 +169,7 @@ def init_kb(row):
         open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
 
 
-def embedding(docs, mdl, parser_config={}):
+def embedding(docs, mdl, parser_config={}, callback=None):
     tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
         d["content_with_weight"] for d in docs]
     tk_count = 0
@@ -176,8 +177,14 @@ def embedding(docs, mdl, parser_config={}):
         tts, c = mdl.encode(tts)
         tk_count += c
 
-    cnts, c = mdl.encode(cnts)
-    tk_count += c
+    cnts_ = []
+    for i in range(0, len(cnts), 32):
+        vts, c = mdl.encode(cnts[i: i+32])
+        cnts_.extend(vts)
+        tk_count += c
+        callback(msg="")
+    cnts = cnts_
+
     title_w = float(parser_config.get("filename_embd_weight", 0.1))
     vects = (title_w * tts + (1 - title_w) *
              cnts) if len(tts) == len(cnts) else cnts
@@ -218,10 +225,11 @@ def main(comm, mod):
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
         try:
-            tk_count = embedding(cks, embd_mdl, r["parser_config"])
+            tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
         except Exception as e:
             callback(-1, "Embedding error:{}".format(str(e)))
             cron_logger.error(str(e))
+            tk_count = 0
 
         callback(msg="Finished embedding! Start to build index!")
         init_kb(r)
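Content embedding now runs in batches of 32 with a keep-alive `callback(msg="")` per batch, so long documents keep refreshing their progress row instead of going silent, and a failed embedding now falls back to `tk_count = 0` rather than leaving the variable unbound. A standalone sketch of the batched loop; the `mdl.encode` contract (vectors plus a token count) is taken from this diff:

def embed_in_batches(mdl, cnts, callback, batch_size=32):
    vecs, tk_count = [], 0
    for i in range(0, len(cnts), batch_size):
        vts, c = mdl.encode(cnts[i:i + batch_size])  # vectors + token count
        vecs.extend(vts)
        tk_count += c
        callback(msg="")  # heartbeat so the task's progress message stays fresh
    return vecs, tk_count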