| from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap | |
| from magic_pdf.libs.drop_reason import DropReason | |
| def _remove_overlap_between_bbox(bbox1, bbox2): | |
| if _is_part_overlap(bbox1, bbox2): | |
| ix0, iy0, ix1, iy1 = bbox1 | |
| x0, y0, x1, y1 = bbox2 | |
| diff_x = min(x1, ix1) - max(x0, ix0) | |
| diff_y = min(y1, iy1) - max(y0, iy0) | |
| if diff_y > diff_x: | |
| if x1 >= ix1: | |
| mid = (x0 + ix1) // 2 | |
| ix1 = min(mid - 0.25, ix1) | |
| x0 = max(mid + 0.25, x0) | |
| else: | |
| mid = (ix0 + x1) // 2 | |
| ix0 = max(mid + 0.25, ix0) | |
| x1 = min(mid - 0.25, x1) | |
| else: | |
| if y1 >= iy1: | |
| mid = (y0 + iy1) // 2 | |
| y0 = max(mid + 0.25, y0) | |
| iy1 = min(iy1, mid-0.25) | |
| else: | |
| mid = (iy0 + y1) // 2 | |
| y1 = min(y1, mid-0.25) | |
| iy0 = max(mid + 0.25, iy0) | |
| if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0: | |
| bbox1 = [ix0, iy0, ix1, iy1] | |
| bbox2 = [x0, y0, x1, y1] | |
| return bbox1, bbox2, None | |
| else: | |
| return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA | |
| else: | |
| return bbox1, bbox2, None | |
| def _remove_overlap_between_bboxes(arr): | |
| drop_reasons = [] | |
| N = len(arr) | |
| keeps = [True] * N | |
| res = [None] * N | |
| for i in range(N): | |
| for j in range(N): | |
| if i == j: | |
| continue | |
| if _is_in(arr[i]["bbox"], arr[j]["bbox"]): | |
| keeps[i] = False | |
| for idx, v in enumerate(arr): | |
| if not keeps[idx]: | |
| continue | |
| for i in range(N): | |
| if res[i] is None: | |
| continue | |
| bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"]) | |
| if drop_reason is None: | |
| v["bbox"] = bbox1 | |
| res[i]["bbox"] = bbox2 | |
| else: | |
| if v["score"] > res[i]["score"]: | |
| keeps[i] = False | |
| res[i] = None | |
| else: | |
| keeps[idx] = False | |
| drop_reasons.append(drop_reasons) | |
| if keeps[idx]: | |
| res[idx] = v | |
| return res, drop_reasons | |
| def remove_overlap_between_bbox_for_span(spans): | |
| arr = [{"bbox": span["bbox"], "score": span.get("score", 0.1)} for span in spans ] | |
| res, drop_reasons = _remove_overlap_between_bboxes(arr) | |
| ret = [] | |
| for i in range(len(res)): | |
| if res[i] is None: | |
| continue | |
| spans[i]["bbox"] = res[i]["bbox"] | |
| ret.append(spans[i]) | |
| return ret, drop_reasons | |
| def remove_overlap_between_bbox_for_block(all_bboxes): | |
| arr = [{"bbox": bbox[:4], "score": bbox[-1]} for bbox in all_bboxes ] | |
| res, drop_reasons = _remove_overlap_between_bboxes(arr) | |
| ret = [] | |
| for i in range(len(res)): | |
| if res[i] is None: | |
| continue | |
| all_bboxes[i][:4] = res[i]["bbox"] | |
| ret.append(all_bboxes[i]) | |
| return ret, drop_reasons | |