Commit
·
e347567
1
Parent(s):
ad31b8f
fix bug in highlighting of entity when clicking on markdown text
Browse files
ocr.py
CHANGED
|
@@ -119,10 +119,11 @@ Layout Class:
|
|
| 119 |
|
| 120 |
|
| 121 |
class Layout:
|
| 122 |
-
def __init__(self, show_unknown: bool = False):
|
| 123 |
self.counts = {layout_type: 0 for layout_type in LayoutType}
|
| 124 |
self.records: dict[LayoutType, Any] = {layout_type: [] for layout_type in LayoutType}
|
| 125 |
self.recovery = """"""
|
|
|
|
| 126 |
self.show_unknown = show_unknown
|
| 127 |
|
| 128 |
def add(
|
|
@@ -145,7 +146,7 @@ class Layout:
|
|
| 145 |
"table": table,
|
| 146 |
})
|
| 147 |
if layout_type != LayoutType.UNKNOWN or self.show_unknown: # Discards the unknown layout types detections
|
| 148 |
-
path = f"recording://Image/{layout_type.type.title()}/{name.title()}"
|
| 149 |
self.recovery += f"\n\n## [{name.title()}]({path})\n\n" # Log Type as Heading
|
| 150 |
# Enhancement - Logged image for Figure type TODO(#6517)
|
| 151 |
if layout_type == LayoutType.TABLE:
|
|
@@ -153,7 +154,7 @@ class Layout:
|
|
| 153 |
self.recovery += table # Log details (table)
|
| 154 |
elif detections:
|
| 155 |
for index, detection in enumerate(detections):
|
| 156 |
-
path_text = f"recording://Image/{layout_type.type.title()}/{name.title()}/Detections/{index}"
|
| 157 |
self.recovery += f' [{detection["text"]}]({path_text})' # Log details (text)
|
| 158 |
else:
|
| 159 |
logging.warning(f"Invalid layout type detected: {layout_type}")
|
|
@@ -221,13 +222,14 @@ class Layout:
|
|
| 221 |
return f"Error processing the table: {str(e)}"
|
| 222 |
|
| 223 |
|
| 224 |
-
def process_layout_records(log_queue: SimpleQueue[Any], layout: Layout
|
| 225 |
paths, detections_paths = [], []
|
| 226 |
zoom_paths: list[rrb.Spatial2DView] = []
|
| 227 |
zoom_paths_figures: list[rrb.Spatial2DView] = []
|
| 228 |
zoom_paths_tables: list[rrb.Spatial2DView] = []
|
| 229 |
zoom_paths_texts: list[rrb.Spatial2DView] = []
|
| 230 |
|
|
|
|
| 231 |
for layout_type in LayoutType:
|
| 232 |
for record in layout.records[layout_type]:
|
| 233 |
record_name = record["name"].title()
|
|
@@ -327,11 +329,11 @@ def update_zoom_paths(
|
|
| 327 |
|
| 328 |
def generate_blueprint(
|
| 329 |
layouts: list[Layout],
|
| 330 |
-
page_paths: list[str],
|
| 331 |
processed_layouts: list[LayoutStructure],
|
| 332 |
) -> rrb.Blueprint:
|
| 333 |
page_tabs = []
|
| 334 |
-
for layout,
|
|
|
|
| 335 |
paths, detections_paths, zoom_paths_figures, zoom_paths_tables, zoom_paths_texts = processed_layout
|
| 336 |
|
| 337 |
section_tabs = []
|
|
@@ -399,28 +401,28 @@ def detect_and_log_layouts(log_queue: SimpleQueue[Any], file_path: str, start_pa
|
|
| 399 |
|
| 400 |
# Extract the layout from each image
|
| 401 |
layouts: list[Layout] = []
|
| 402 |
-
|
| 403 |
processed_layouts: list[LayoutStructure] = []
|
| 404 |
-
for i, (image,
|
| 405 |
-
layouts.append(detect_and_log_layout(log_queue, image,
|
| 406 |
|
| 407 |
# Generate and send a blueprint based on the detected layouts
|
| 408 |
processed_layouts.append(
|
| 409 |
process_layout_records(
|
| 410 |
log_queue,
|
| 411 |
layouts[-1],
|
| 412 |
-
page_path,
|
| 413 |
)
|
| 414 |
)
|
| 415 |
logging.info("Sending blueprint...")
|
| 416 |
-
blueprint = generate_blueprint(layouts,
|
| 417 |
log_queue.put(["blueprint", blueprint])
|
| 418 |
logging.info("Blueprint sent...")
|
| 419 |
|
| 420 |
|
| 421 |
-
def detect_and_log_layout(log_queue: SimpleQueue, coloured_image: npt.NDArray[np.uint8],
|
| 422 |
# Layout Object - This will contain the detected layouts and their detections
|
| 423 |
-
layout = Layout()
|
|
|
|
| 424 |
|
| 425 |
# Log Image and add Annotation Context
|
| 426 |
log_queue.put([
|
|
|
|
| 119 |
|
| 120 |
|
| 121 |
class Layout:
|
| 122 |
+
def __init__(self, page_number: int, show_unknown: bool = False):
|
| 123 |
self.counts = {layout_type: 0 for layout_type in LayoutType}
|
| 124 |
self.records: dict[LayoutType, Any] = {layout_type: [] for layout_type in LayoutType}
|
| 125 |
self.recovery = """"""
|
| 126 |
+
self.page_number = page_number
|
| 127 |
self.show_unknown = show_unknown
|
| 128 |
|
| 129 |
def add(
|
|
|
|
| 146 |
"table": table,
|
| 147 |
})
|
| 148 |
if layout_type != LayoutType.UNKNOWN or self.show_unknown: # Discards the unknown layout types detections
|
| 149 |
+
path = f"recording://page_{self.page_number}/Image/{layout_type.type.title()}/{name.title()}"
|
| 150 |
self.recovery += f"\n\n## [{name.title()}]({path})\n\n" # Log Type as Heading
|
| 151 |
# Enhancement - Logged image for Figure type TODO(#6517)
|
| 152 |
if layout_type == LayoutType.TABLE:
|
|
|
|
| 154 |
self.recovery += table # Log details (table)
|
| 155 |
elif detections:
|
| 156 |
for index, detection in enumerate(detections):
|
| 157 |
+
path_text = f"recording://page_{self.page_number}/Image/{layout_type.type.title()}/{name.title()}/Detections/{index}"
|
| 158 |
self.recovery += f' [{detection["text"]}]({path_text})' # Log details (text)
|
| 159 |
else:
|
| 160 |
logging.warning(f"Invalid layout type detected: {layout_type}")
|
|
|
|
| 222 |
return f"Error processing the table: {str(e)}"
|
| 223 |
|
| 224 |
|
| 225 |
+
def process_layout_records(log_queue: SimpleQueue[Any], layout: Layout) -> LayoutStructure:
|
| 226 |
paths, detections_paths = [], []
|
| 227 |
zoom_paths: list[rrb.Spatial2DView] = []
|
| 228 |
zoom_paths_figures: list[rrb.Spatial2DView] = []
|
| 229 |
zoom_paths_tables: list[rrb.Spatial2DView] = []
|
| 230 |
zoom_paths_texts: list[rrb.Spatial2DView] = []
|
| 231 |
|
| 232 |
+
page_path = f'page_{layout.page_number}'
|
| 233 |
for layout_type in LayoutType:
|
| 234 |
for record in layout.records[layout_type]:
|
| 235 |
record_name = record["name"].title()
|
|
|
|
| 329 |
|
| 330 |
def generate_blueprint(
|
| 331 |
layouts: list[Layout],
|
|
|
|
| 332 |
processed_layouts: list[LayoutStructure],
|
| 333 |
) -> rrb.Blueprint:
|
| 334 |
page_tabs = []
|
| 335 |
+
for layout, processed_layout in zip(layouts, processed_layouts):
|
| 336 |
+
page_path = f'page_{layout.page_number}'
|
| 337 |
paths, detections_paths, zoom_paths_figures, zoom_paths_tables, zoom_paths_texts = processed_layout
|
| 338 |
|
| 339 |
section_tabs = []
|
|
|
|
| 401 |
|
| 402 |
# Extract the layout from each image
|
| 403 |
layouts: list[Layout] = []
|
| 404 |
+
page_numbers = [i + start_page for i in range(len(images))]
|
| 405 |
processed_layouts: list[LayoutStructure] = []
|
| 406 |
+
for i, (image, page_number) in enumerate(zip(images, page_numbers)):
|
| 407 |
+
layouts.append(detect_and_log_layout(log_queue, image, page_number))
|
| 408 |
|
| 409 |
# Generate and send a blueprint based on the detected layouts
|
| 410 |
processed_layouts.append(
|
| 411 |
process_layout_records(
|
| 412 |
log_queue,
|
| 413 |
layouts[-1],
|
|
|
|
| 414 |
)
|
| 415 |
)
|
| 416 |
logging.info("Sending blueprint...")
|
| 417 |
+
blueprint = generate_blueprint(layouts, processed_layouts)
|
| 418 |
log_queue.put(["blueprint", blueprint])
|
| 419 |
logging.info("Blueprint sent...")
|
| 420 |
|
| 421 |
|
| 422 |
+
def detect_and_log_layout(log_queue: SimpleQueue, coloured_image: npt.NDArray[np.uint8], page_number: int) -> Layout:
|
| 423 |
# Layout Object - This will contain the detected layouts and their detections
|
| 424 |
+
layout = Layout(page_number)
|
| 425 |
+
page_path = f'page_{page_number}'
|
| 426 |
|
| 427 |
# Log Image and add Annotation Context
|
| 428 |
log_queue.put([
|