Kevin Hu committed on
Commit
1955ace
·
1 Parent(s): 54cd6b8

Fix t_recognizer.py after the model update. (#4330)

Browse files

### What problem does this PR solve?

#4230

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (1) hide show
  1. deepdoc/vision/t_recognizer.py +11 -18
deepdoc/vision/t_recognizer.py CHANGED
@@ -23,8 +23,7 @@ sys.path.insert(
23
  '../../')))
24
 
25
  from deepdoc.vision.seeit import draw_box
26
- from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
27
- from api.utils.file_utils import get_project_base_directory
28
  import argparse
29
  import re
30
  import numpy as np
@@ -33,13 +32,7 @@ import numpy as np
33
  def main(args):
34
  images, outputs = init_in_out(args)
35
  if args.mode.lower() == "layout":
36
- labels = LayoutRecognizer.labels
37
- detr = Recognizer(
38
- labels,
39
- "layout",
40
- os.path.join(
41
- get_project_base_directory(),
42
- "rag/res/deepdoc/"))
43
  if args.mode.lower() == "tsr":
44
  labels = TableStructureRecognizer.labels
45
  detr = TableStructureRecognizer()
@@ -64,7 +57,7 @@ def main(args):
64
 
65
  def get_table_html(img, tb_cpns, ocr):
66
  boxes = ocr(np.array(img))
67
- boxes = Recognizer.sort_Y_firstly(
68
  [{"x0": b[0][0], "x1": b[1][0],
69
  "top": b[0][1], "text": t[0],
70
  "bottom": b[-1][1],
@@ -75,26 +68,26 @@ def get_table_html(img, tb_cpns, ocr):
75
 
76
  def gather(kwd, fzy=10, ption=0.6):
77
  nonlocal boxes
78
- eles = Recognizer.sort_Y_firstly(
79
  [r for r in tb_cpns if re.match(kwd, r["label"])], fzy)
80
- eles = Recognizer.layouts_cleanup(boxes, eles, 5, ption)
81
- return Recognizer.sort_Y_firstly(eles, 0)
82
 
83
  headers = gather(r".*header$")
84
  rows = gather(r".* (row|header)")
85
  spans = gather(r".*spanning")
86
  clmns = sorted([r for r in tb_cpns if re.match(
87
  r"table column$", r["label"])], key=lambda x: x["x0"])
88
- clmns = Recognizer.layouts_cleanup(boxes, clmns, 5, 0.5)
89
 
90
  for b in boxes:
91
- ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
92
  if ii is not None:
93
  b["R"] = ii
94
  b["R_top"] = rows[ii]["top"]
95
  b["R_bott"] = rows[ii]["bottom"]
96
 
97
- ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
98
  if ii is not None:
99
  b["H_top"] = headers[ii]["top"]
100
  b["H_bott"] = headers[ii]["bottom"]
@@ -102,13 +95,13 @@ def get_table_html(img, tb_cpns, ocr):
102
  b["H_right"] = headers[ii]["x1"]
103
  b["H"] = ii
104
 
105
- ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
106
  if ii is not None:
107
  b["C"] = ii
108
  b["C_left"] = clmns[ii]["x0"]
109
  b["C_right"] = clmns[ii]["x1"]
110
 
111
- ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
112
  if ii is not None:
113
  b["H_top"] = spans[ii]["top"]
114
  b["H_bott"] = spans[ii]["bottom"]
 
23
  '../../')))
24
 
25
  from deepdoc.vision.seeit import draw_box
26
+ from deepdoc.vision import LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
 
27
  import argparse
28
  import re
29
  import numpy as np
 
32
  def main(args):
33
  images, outputs = init_in_out(args)
34
  if args.mode.lower() == "layout":
35
+ detr = LayoutRecognizer("layout")
 
 
 
 
 
 
36
  if args.mode.lower() == "tsr":
37
  labels = TableStructureRecognizer.labels
38
  detr = TableStructureRecognizer()
 
57
 
58
  def get_table_html(img, tb_cpns, ocr):
59
  boxes = ocr(np.array(img))
60
+ boxes = LayoutRecognizer.sort_Y_firstly(
61
  [{"x0": b[0][0], "x1": b[1][0],
62
  "top": b[0][1], "text": t[0],
63
  "bottom": b[-1][1],
 
68
 
69
  def gather(kwd, fzy=10, ption=0.6):
70
  nonlocal boxes
71
+ eles = LayoutRecognizer.sort_Y_firstly(
72
  [r for r in tb_cpns if re.match(kwd, r["label"])], fzy)
73
+ eles = LayoutRecognizer.layouts_cleanup(boxes, eles, 5, ption)
74
+ return LayoutRecognizer.sort_Y_firstly(eles, 0)
75
 
76
  headers = gather(r".*header$")
77
  rows = gather(r".* (row|header)")
78
  spans = gather(r".*spanning")
79
  clmns = sorted([r for r in tb_cpns if re.match(
80
  r"table column$", r["label"])], key=lambda x: x["x0"])
81
+ clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5)
82
 
83
  for b in boxes:
84
+ ii = LayoutRecognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
85
  if ii is not None:
86
  b["R"] = ii
87
  b["R_top"] = rows[ii]["top"]
88
  b["R_bott"] = rows[ii]["bottom"]
89
 
90
+ ii = LayoutRecognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
91
  if ii is not None:
92
  b["H_top"] = headers[ii]["top"]
93
  b["H_bott"] = headers[ii]["bottom"]
 
95
  b["H_right"] = headers[ii]["x1"]
96
  b["H"] = ii
97
 
98
+ ii = LayoutRecognizer.find_horizontally_tightest_fit(b, clmns)
99
  if ii is not None:
100
  b["C"] = ii
101
  b["C_left"] = clmns[ii]["x0"]
102
  b["C_right"] = clmns[ii]["x1"]
103
 
104
+ ii = LayoutRecognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
105
  if ii is not None:
106
  b["H_top"] = spans[ii]["top"]
107
  b["H_bott"] = spans[ii]["bottom"]